Skip to content

Filter example: split a single HTML document into multiple documents

Description

Filters allow documents to be split up, thus converting a single document into multiple documents which will be indexed as individual documents. Although in this example StringDocumentFilter is implemented, both ByteDocumentFilter and Filter can be used to produce multiple documents.

Example

In this example we split the input HTML document on its articles, creating a new document for each article in the original document. As we are creating new documents, we must assign a new URI to each one; in this example the URI is taken from the first link inside each article of the source document. New documents are created by cloning the original document, which preserves other parts of the document such as metadata. This example implements StringDocumentFilter. We are required to implement canFilter(), which checks whether the given document is an HTML document, as well as filterAsStringDocument(), which contains the logic for the filter.

This example also has a simple test method, which can be executed by running the main method (see testing Groovy filters).

package com.myfilters;

import java.net.URI;
import java.util.*;
import org.jsoup.*;
import org.jsoup.nodes.*;
import org.junit.*;
import org.junit.Test;
import com.funnelback.filter.api.*;
import com.funnelback.filter.api.DocumentType;
import com.funnelback.filter.api.documents.*;
import com.funnelback.filter.api.filters.*;
import com.funnelback.filter.api.mock.*;

@groovy.util.logging.Log4j2
/**
 * Splits a single HTML document into one document per {@code <article>} element.
 *
 * Each new document is a clone of the source document (so metadata and other
 * attributes are preserved) whose URI is taken from the first link inside the
 * article and whose content is a minimal HTML page wrapping the article markup.
 * Articles that have no usable link are skipped, because every document
 * produced by the filter needs its own URI.
 */
public class SplitHTMLDocumentFilter implements StringDocumentFilter {

    /**
     * Only HTML documents are worth attempting to split.
     *
     * @param document the document about to be filtered (content not loaded)
     * @param context  the filter context
     * @return {@code ATTEMPT_FILTER} for HTML documents, {@code SKIP_FILTER} otherwise
     */
    @Override
    public PreFilterCheck canFilter(NoContentDocument document, FilterContext context) {
        if (document.getDocumentType().isHTML()) {
            return PreFilterCheck.ATTEMPT_FILTER;
        }
        return PreFilterCheck.SKIP_FILTER;
    }

    /**
     * Creates one document for every {@code <article>} element that carries a link.
     *
     * @param document the HTML document to split
     * @param context  the filter context
     * @return a result holding the per-article documents, in document order
     */
    @Override
    public FilterResult filterAsStringDocument(StringDocument document, FilterContext context) {

        // Parse with the document URI as base URI so that relative links can be resolved
        Document jsoupDoc = Jsoup.parse(document.getContentAsString(), document.getURI().toString());

        List<FilterableDocument> docs = new ArrayList<>();

        // Every article element becomes a candidate document
        for (Element element : jsoupDoc.select("article")) {
            URI uri = articleUri(element);
            if (uri == null) {
                // Without a URI the new document could not be told apart from the others
                continue;
            }

            log.info("Creating document for article with URL: " + uri.toASCIIString());

            // Create a basic html page
            Document article = Jsoup.parse("<html><head></head><body></body></html>");

            // Insert the article into the basic html page
            article.body().html(element.html());

            // Clone the existing document with the new URI and content, preserving all other
            // attributes including meta data
            docs.add(document.cloneWithURI(uri).cloneWithStringContent(document.getDocumentType(), article.html()));
        }

        return FilterResult.of(docs);
    }

    /**
     * Works out the URI for an article from its first link that has an href.
     *
     * Relative links are resolved against the base URI of the parsed page. When
     * that is not possible the raw href value is used unchanged.
     *
     * @param article the article element to inspect
     * @return the URI for the article, or {@code null} when the article has no
     *         link or the link is not a valid URI
     */
    private URI articleUri(Element article) {
        Element link = article.select("a[href]").first();
        if (link == null) {
            log.warn("Skipping article as it does not contain a link to take the URL from");
            return null;
        }

        String href = link.absUrl("href");
        if (href.isEmpty()) {
            // absUrl() gives an empty string when it cannot build an absolute URL
            href = link.attr("href");
        }
        href = href.trim();
        if (href.isEmpty()) {
            log.warn("Skipping article as its link has an empty href");
            return null;
        }

        try {
            return URI.create(href);
        } catch (IllegalArgumentException e) {
            // One bad link must not stop the remaining articles from being processed
            log.warn("Skipping article as its link is not a valid URI: " + href, e);
            return null;
        }
    }

    /*
     * Below are filter test methods.
     */
    public static class FilterTest {

        @Test
        public void splitArticlesTest() {
            //Create a input document with two articles
            StringDocument inputDoc = MockDocuments.mockEmptyStringDoc()
                           .cloneWithStringContent(DocumentType.MIME_HTML_TEXT,
                               "<html><head></head><body>\n"
                                   +"<article>\n"
                                       +"<a href=\"http://articale.com/one\">Article one</a> \n"
                                       +"<p>Article one stuff</p>\n"
                                   +"</article>\n"
                                   +"<article>\n"
                                       +"<a href=\"http://articale.com/two\">Article two</a> \n"
                                       +"<p>Article two stuff</p>\n"
                                   +"</article>\n"
                                   +"</body></html>");

            FilterResult filterResult = new SplitHTMLDocumentFilter().filter(inputDoc, MockFilterContext.getEmptyContext());

            Assert.assertEquals("Should have returned two documents one for each article",
                    2, filterResult.getFilteredDocuments().size());

            //Check article one
            StringDocument articleOne = (StringDocument) filterResult.getFilteredDocuments().get(0);

            Assert.assertEquals("Article one URL is wrong",
                "http://articale.com/one", articleOne.getURI().toASCIIString());

            Assert.assertTrue("Check article one has article one content",
                articleOne.getContentAsString().contains("Article one stuff"));

            Assert.assertFalse("Check article one does NOT have article two content",
                articleOne.getContentAsString().contains("Article two stuff"));

            //Check article two
            StringDocument articleTwo = (StringDocument) filterResult.getFilteredDocuments().get(1);

            Assert.assertEquals("Article two URL is wrong",
                "http://articale.com/two", articleTwo.getURI().toASCIIString());

            Assert.assertTrue("Check article two has article two content",
                articleTwo.getContentAsString().contains("Article two stuff"));

            Assert.assertFalse("Check article two does NOT have article one content",
                articleTwo.getContentAsString().contains("Article one stuff"));
        }

        @Test
        public void relativeAndMissingLinksTest() {
            //Create a input document with a relative link and an article without any link
            StringDocument inputDoc = MockDocuments.mockEmptyStringDoc()
                           .cloneWithURI(URI.create("http://example.com/news/index.html"))
                           .cloneWithStringContent(DocumentType.MIME_HTML_TEXT,
                               "<html><head></head><body>\n"
                                   +"<article>\n"
                                       +"<a href=\"one.html\">Article one</a> \n"
                                       +"<p>Article one stuff</p>\n"
                                   +"</article>\n"
                                   +"<article>\n"
                                       +"<p>Article without a link</p>\n"
                                   +"</article>\n"
                                   +"</body></html>");

            FilterResult filterResult = new SplitHTMLDocumentFilter().filter(inputDoc, MockFilterContext.getEmptyContext());

            Assert.assertEquals("Should have skipped the article without a link",
                    1, filterResult.getFilteredDocuments().size());

            StringDocument articleOne = (StringDocument) filterResult.getFilteredDocuments().get(0);

            Assert.assertEquals("Relative link should be resolved against the document URL",
                "http://example.com/news/one.html", articleOne.getURI().toASCIIString());

            Assert.assertTrue("Check article one has article one content",
                articleOne.getContentAsString().contains("Article one stuff"));
        }
    }

    //Running the main method will execute the test methods.
    public static void main(String[] args) throws Exception {
        FilterTestRunner.runTests(FilterTest.class);
    }

}

See also:

top

Funnelback logo
v15.16.0