Skip to content

Filter example: remove a document

Description

Filters allow the removal of documents; this can be used to prevent a document from appearing in the index. Although this example implements Filter, both ByteDocumentFilter and StringDocumentFilter can also be used to remove documents.

Example

In this example we remove any document that looks like a sitemap URL. This example implements Filter, which requires the filter method to be implemented.

This example also has a simple test, which can be executed by running the main method; see testing Groovy filters.

package com.myfilters;

import java.net.URI;
import org.junit.*;
import org.junit.Test;
import com.funnelback.filter.api.*;
import com.funnelback.filter.api.documents.*;
import com.funnelback.filter.api.filters.*;
import com.funnelback.filter.api.mock.*;

/**
 * Filter that removes any document whose URL path ends with {@code sitemap.xml}.
 *
 * <p>Documents whose URL does not match are returned unchanged. The nested
 * {@link FilterTest} class holds the tests for this filter; they can be executed
 * by running {@link #main(String[])}.
 */
@groovy.util.logging.Log4j2
public class RemoveSitemapXMLDocuments implements Filter {

    /** Path suffix used to recognise a site map URL. */
    private static final String SITEMAP_SUFFIX = "sitemap.xml";

    /**
     * Removes the document when its URL looks like a site map, otherwise keeps it as is.
     *
     * @param document the document being filtered; only its URI is inspected
     * @param context the filter context; not used by this filter
     * @return {@code FilterResult.delete()} when the URL path ends with
     *         {@code sitemap.xml}, otherwise a result wrapping the unmodified document
     */
    @Override
    public FilterResult filter(FilterableDocument document, FilterContext context) throws RuntimeException,
        FilterException {
        // java.net.URI#getPath() returns null for opaque URIs (for example "mailto:..."),
        // so the path must be null-checked before it is inspected. A URI without a path
        // cannot be a site map, so such documents are kept.
        String path = document.getURI().getPath();

        //Assume any URL path that ends with sitemap.xml is a site map URL.
        if (path != null && path.endsWith(SITEMAP_SUFFIX)) {
            //Returning this removes the document, when crawling the document will
            //not be stored.
            return FilterResult.delete();
        }

        log.debug("Keeping document with URL: " + document.getURI() + " as it is not a site map.");

        return FilterResult.of(document);
    }

    /*
     * Below are filter test methods.
     */
    public static class FilterTest {
        @Test
        public void removeSitemapTest() {
            StringDocument inputDoc = MockDocuments.mockEmptyStringDoc()
                                                    .cloneWithURI(URI.create("https://foo.com/sitemap.xml"));
            FilterResult filterResult = new RemoveSitemapXMLDocuments().filter(inputDoc,
                                                                                MockFilterContext.getEmptyContext());

            Assert.assertEquals("No documents should have been returned as sitemap URLs should be removed",
                                    0, filterResult.getFilteredDocuments().size());
        }

        @Test
        public void keepsNonSitemapDocumentsTest() {
            StringDocument inputDoc = MockDocuments.mockEmptyStringDoc()
                                                    .cloneWithURI(URI.create("https://foo.com/hello.html"));
            FilterResult filterResult = new RemoveSitemapXMLDocuments().filter(inputDoc,
                                            MockFilterContext.getEmptyContext());

            Assert.assertEquals("One document should have been returned by the filter",
                                    1, filterResult.getFilteredDocuments().size());

            Assert.assertEquals("We should have returned the original document without modification",
                                    inputDoc, filterResult.getFilteredDocuments().get(0));
        }

        //Opaque URIs have no path component; the filter must keep them rather than fail.
        @Test
        public void keepsDocumentsWithoutPathTest() {
            StringDocument inputDoc = MockDocuments.mockEmptyStringDoc()
                                                    .cloneWithURI(URI.create("mailto:someone@foo.com"));
            FilterResult filterResult = new RemoveSitemapXMLDocuments().filter(inputDoc,
                                            MockFilterContext.getEmptyContext());

            Assert.assertEquals("A document whose URL has no path should be kept",
                                    1, filterResult.getFilteredDocuments().size());
        }
    }

    //Running the main method will execute the test methods.
    public static void main(String[] args) throws Exception {
        FilterTestRunner.runTests(FilterTest.class);
    }
}

See also:

top

Funnelback logo
v15.24.0