Skip to content

Filter example: split a single document into multiple documents

Description

Filters can split a document, converting a single document into multiple documents that are each indexed separately. Although this example implements StringDocumentFilter, both ByteDocumentFilter and Filter can also be used to produce multiple documents.

Example

In this example we split the input document into lines and create a new document for each line in the original document. Because we are creating new documents, we must assign a new URI to each one; in this example we generate a random URI for each new document. New documents are created by cloning the original document, which preserves other parts of the document such as metadata. This example implements StringDocumentFilter. We are required to implement canFilter(), which checks whether the given document is a plain text document, as well as filterAsStringDocument(), which contains the logic for the filter.

This example also has a simple test method, which can be executed by running the main method; see testing Groovy filters.

package com.myfilters;

import java.net.URI;
import java.util.*;
import org.junit.*;
import org.junit.Test;
import com.funnelback.filter.api.*;
import com.funnelback.filter.api.documents.*;
import com.funnelback.filter.api.filters.*;
import com.funnelback.filter.api.mock.*;

/**
 * Example filter that splits a plain text document into one document per line.
 *
 * <p>Each produced document is a clone of the original (so attributes other than
 * the URI and content are carried over) with a freshly generated random URI and
 * a single line of the original content.
 */
@groovy.util.logging.Log4j2
public class SplitLinesDocumentFilter implements StringDocumentFilter {

    /**
     * Regular expression used to break the content into lines. Accepts both
     * Unix ("\n") and Windows ("\r\n") line endings so that no trailing
     * carriage return is left on the content of the produced documents.
     */
    private static final String LINE_SEPARATOR_REGEX = "\\r?\\n";

    /**
     * Decides whether this filter should run on the given document.
     *
     * @param document the document (without content) being considered
     * @param context the filter context
     * @return {@code ATTEMPT_FILTER} when the content type starts with
     *         "text/plain", otherwise {@code SKIP_FILTER}
     */
    @Override
    public PreFilterCheck canFilter(NoContentDocument document, FilterContext context) {
        //Only filter text documents.
        if(document.getDocumentType().asContentType().startsWith("text/plain")) {
            return PreFilterCheck.ATTEMPT_FILTER;
        }
        return PreFilterCheck.SKIP_FILTER;
    }

    /**
     * Splits the document content into lines and produces one new document per line.
     *
     * @param document the document to split
     * @param context the filter context
     * @return a result holding the documents created from each line, in the
     *         order the lines appear in the original content
     */
    @Override
    public FilterResult filterAsStringDocument(StringDocument document, FilterContext context) {
        List<StringDocument> documents = new ArrayList<>();
        
        //Split the given document into lines and create a new document for each line.
        for(String line : document.getContentAsString().split(LINE_SEPARATOR_REGEX)){
            //Create a random URL for our new document.
            String newUrl = "file://line/" + UUID.randomUUID().toString();
            
            log.info("Creating document with URL: " + newUrl);
            
            //Create a new document with the new url and the new line preserving 
            //all other attributes.
            StringDocument newDocument = document.cloneWithURI(URI.create(newUrl))
                                                  .cloneWithStringContent(document.getDocumentType(), line);
            documents.add(newDocument);
        }
        
        //Return all the documents we created.
        return FilterResult.of(documents);
    }
    
    /*
     * Below are filter test methods. 
     */
    public static class FilterTest {
        
        @Test
        public void multipleLinesTest() {
            //Create document with three lines.
            StringDocument stringDocument = MockDocuments.mockEmptyStringDoc()
                                                         .cloneWithStringContent(DocumentType.fromContentType("text/plain"), 
                                                             "line one\nline two\nline three");
            
            //Create and run the test.
            FilterResult filterResult = new SplitLinesDocumentFilter().filter(stringDocument, MockFilterContext.getEmptyContext());
            
            Assert.assertEquals("Should have produced three documents as the original document had 3 lines", 
                                    3, filterResult.getFilteredDocuments().size());
            
            List<FilterableDocument> results = filterResult.getFilteredDocuments();
            
            Assert.assertEquals("Check content of document one", 
                "line one", ((StringDocument) results.get(0)).getContentAsString());
            Assert.assertEquals("Check content document two",
                "line two", ((StringDocument) results.get(1)).getContentAsString());
            Assert.assertEquals("Check content document three",
                "line three", ((StringDocument) results.get(2)).getContentAsString());
            
            //Dump all URLs into a Set and confirm each document got a unique URL
            Set<String> urls = new HashSet<>();
            urls.add(results.get(0).getURI().toASCIIString());
            urls.add(results.get(1).getURI().toASCIIString());
            urls.add(results.get(2).getURI().toASCIIString());
            
            Assert.assertEquals("Each produced document should have a unique URL", 3, urls.size());
        }
        
        @Test
        public void windowsLineEndingsTest() {
            //Create document with two lines separated by a Windows (CRLF) line ending.
            StringDocument stringDocument = MockDocuments.mockEmptyStringDoc()
                                                         .cloneWithStringContent(DocumentType.fromContentType("text/plain"), 
                                                             "line one\r\nline two");
            
            //Create and run the test.
            FilterResult filterResult = new SplitLinesDocumentFilter().filter(stringDocument, MockFilterContext.getEmptyContext());
            
            List<FilterableDocument> results = filterResult.getFilteredDocuments();
            
            Assert.assertEquals("Should have produced two documents as the original document had 2 lines", 
                                    2, results.size());
            
            //The carriage return must not leak into the content of the produced documents.
            Assert.assertEquals("Check content of document one", 
                "line one", ((StringDocument) results.get(0)).getContentAsString());
            Assert.assertEquals("Check content document two",
                "line two", ((StringDocument) results.get(1)).getContentAsString());
        }
        
    }
    
    //Running the main method will execute the test methods.
    public static void main(String[] args) throws Exception {
        FilterTestRunner.runTests(FilterTest.class);
    }

}

See also:

top

Funnelback logo
v15.16.0