Skip to content

Filter example: binary document manipulation

Description

Filters allow manipulating documents at a byte level, which is particularly useful for binary documents such as PDF. In this example a filter is written that Base64-decodes all documents. It decodes the given bytes and returns a document with the decoded bytes. As we don't assume the document to be a String, this filter could be given a Base64-encoded PDF and produce PDF bytes, which could then be processed by another filter.

Example

This example implements the BytesDocumentFilter interface. We are required to implement canFilter(), which is used to check if the filter should be run, as well as filterAsBytesDocument(), which contains the logic for the filter.

This example also has a simple test method, simpleDecodeTest. The test can be executed by running the main method; see testing Groovy filters.

package com.myfilters;

import static java.nio.charset.StandardCharsets.UTF_8;
import java.util.*;
import org.junit.*;
import org.junit.Test;
import com.funnelback.filter.api.*;
import com.funnelback.filter.api.documents.*;
import com.funnelback.filter.api.filters.*;
import com.funnelback.filter.api.mock.*;

@groovy.util.logging.Log4j2
public class Base64DecodingFilter implements BytesDocumentFilter {

  /**
   * Decides whether this filter should be attempted for the given document.
   *
   * No inspection of the document is performed: every document is assumed
   * to be Base64 encoded, so filtering is always attempted.
   *
   * @param document the document being considered (content not available here)
   * @param context the filter context
   * @return always {@code PreFilterCheck.ATTEMPT_FILTER}
   */
  @Override
  public PreFilterCheck canFilter(NoContentDocument document, FilterContext context) {
    return PreFilterCheck.ATTEMPT_FILTER;
  }

  /**
   * Base64 decodes the raw bytes of the document and returns a copy of the
   * document holding the decoded bytes.
   *
   * The document type and charset of the input document are carried over
   * unchanged to the resulting document.
   *
   * @param document the document whose content is Base64 encoded
   * @param context the filter context
   * @return a filter result wrapping the decoded copy of the document
   */
  @Override
  public FilterResult filterAsBytesDocument(BytesDocument document, FilterContext context) {
    // Work on the raw bytes; the content is never interpreted as a String.
    byte[] encodedBytes = document.getCopyOfContents();
    byte[] rawBytes = Base64.getDecoder().decode(encodedBytes);

    log.debug("Decoding resulted in a " + rawBytes.length + " byte content for: " + document.getURI());

    // Clone the document, swapping in the decoded bytes while retaining
    // the original document type and charset.
    BytesDocument decodedDocument = document.cloneWithContent(
      document.getDocumentType(),
      document.getCharset(),
      rawBytes);

    return FilterResult.of(decodedDocument);
  }

  /*
   * Test methods for the filter live in this nested class.
   */
  public static class FilterTest {

    /**
     * Feeds a Base64 encoded document through the filter and verifies that
     * the resulting document holds the decoded text.
     */
    @Test
    public void simpleDecodeTest() throws Exception {
      // Build an input document whose content is Base64 encoded bytes.
      byte[] encodedContent = "SGVsbG8gRnVubmVsYmFjaw==".getBytes(UTF_8);
      BytesDocument encodedDoc = MockDocuments.mockEmptyByteDoc().cloneWithContent(
        DocumentType.MIME_UNKNOWN,
        Optional.empty(),
        encodedContent);

      // Run the filter over the input document.
      Base64DecodingFilter filterUnderTest = new Base64DecodingFilter();
      FilterContext emptyContext = MockFilterContext.getEmptyContext();
      FilterResult outcome = filterUnderTest.filter(encodedDoc, emptyContext);

      // Filtering should yield exactly one document; take it.
      BytesDocument decodedDoc = (BytesDocument) outcome.getFilteredDocuments().get(0);

      // Interpret the decoded bytes as text so they can be compared.
      byte[] decodedContent = decodedDoc.getCopyOfContents();
      String actualText = new String(decodedContent, UTF_8);

      // The decoded text must match the original plain text.
      Assert.assertEquals(
        "The content was not correctly decoded",
        "Hello Funnelback",
        actualText);
    }
  }

  /**
   * Running the main method will execute the test methods in {@code FilterTest}.
   */
  public static void main(String[] args) throws Exception {
    FilterTestRunner.runTests(FilterTest.class);
  }

}

See also:

top

Funnelback logo
v15.16.0