Filter example: split a single HTML document into multiple documents
Description
Filters allow documents to be split up, thus converting a single document into multiple documents which will be indexed as individual documents. Although in this example StringDocumentFilter is implemented, both ByteDocumentFilter and Filter can be used to produce multiple documents.
Example
In this example we split the input HTML document on its <article> elements, creating a new document for each <article> element
in the original document. As we are creating new documents we must assign a new URI
for each new document; in this example the URI is taken from a link inside each article of the source document. New documents are created by cloning the original document, which preserves other parts of the document such as metadata. This example implements the StringDocumentFilter. We are required to implement canFilter(), used to check if the given document is an HTML
document, as well as filterAsStringDocument() which contains the logic for the filter.
This example also has a simple test method which can be executed by running the main method; see testing Groovy filters.
package com.myfilters;
import java.net.URI;
import java.util.*;
import org.jsoup.*;
import org.jsoup.nodes.*;
import org.junit.*;
import org.junit.Test;
import com.funnelback.filter.api.*;
import com.funnelback.filter.api.DocumentType;
import com.funnelback.filter.api.documents.*;
import com.funnelback.filter.api.filters.*;
import com.funnelback.filter.api.mock.*;
@groovy.util.logging.Log4j2
public class SplitHTMLDocumentFilter implements StringDocumentFilter {
// Filter HTML
@Override
public PreFilterCheck canFilter(NoContentDocument document, FilterContext context) {
if(document.getDocumentType().isHTML()) {
return PreFilterCheck.ATTEMPT_FILTER;
}
return PreFilterCheck.SKIP_FILTER;
}
@Override
public FilterResult filterAsStringDocument(StringDocument document, FilterContext context) {
// Create a jsoup object from the string document
Document jsoupDoc = Jsoup.parse(document.getContentAsString(), document.getURI().toString());
// Look for all the artices which represents items which we intend to split on
ArrayList<FilterableDocument> docs = new ArrayList<>();
for(Element element : jsoupDoc.select("article")) {
// Get the url
URI uri = URI.create(element.select("a").attr("href"));
log.info("Creating document for article with URL: " + uri.toASCIIString());
// Create a basic html page
Document article = Jsoup.parse("<html><head></head><body></body></html>");
// Insert the article into the basic html page
article.body().html(element.html());
// Clone the existing document with the new URI and content, preserving all other
// attributes including meta data
docs.add(document.cloneWithURI(uri).cloneWithStringContent(document.getDocumentType(), article.html()));
}
return FilterResult.of(docs);
}
/*
* Below are filter test methods.
*/
public static class FilterTest {
@Test
public void splitArticlesTest() {
//Create a input document with two articles
StringDocument inputDoc = MockDocuments.mockEmptyStringDoc()
.cloneWithStringContent(DocumentType.MIME_HTML_TEXT,
"<html><head></head><body>\n"
+"<article>\n"
+"<a href=\"http://articale.com/one\">Article one</a> \n"
+"<p>Article one stuff</p>\n"
+"</article>\n"
+"<article>\n"
+"<a href=\"http://articale.com/two\">Article two</a> \n"
+"<p>Article two stuff</p>\n"
+"</article>\n"
+"</body></html>");
FilterResult filterResult = new SplitHTMLDocumentFilter().filter(inputDoc, MockFilterContext.getEmptyContext());
Assert.assertEquals("Should have returned two documents one for each article",
2, filterResult.getFilteredDocuments().size());
//Check article one
StringDocument articleOne = (StringDocument) filterResult.getFilteredDocuments().get(0);
Assert.assertEquals("Article one URL is wrong",
"http://articale.com/one", articleOne.getURI().toASCIIString());
Assert.assertTrue("Check article one has article one content",
articleOne.getContentAsString().contains("Article one stuff"));
Assert.assertFalse("Check article one does NOT have article two content",
articleOne.getContentAsString().contains("Article two stuff"));
//Check article two
StringDocument articleTwo = (StringDocument) filterResult.getFilteredDocuments().get(1);
Assert.assertEquals("Article two URL is wrong",
"http://articale.com/two", articleTwo.getURI().toASCIIString());
Assert.assertTrue("Check article two has article two content",
articleTwo.getContentAsString().contains("Article two stuff"));
Assert.assertFalse("Check article two does NOT have article one content",
articleTwo.getContentAsString().contains("Article one stuff"));
}
}
//Running the main method will execute the test methods.
public static void main(String[] args) throws Exception {
FilterTestRunner.runTests(FilterTest.class);
}
}