Filter example: binary document manipulation
Description
Filters allow manipulating documents at a byte level, which is particularly useful for binary documents such as PDF. In this example a filter is written that Base64-decodes all documents. It decodes the given bytes and returns a document with the decoded bytes. As we don't assume the document to be a String, this filter could be given a Base64-encoded PDF and produce PDF bytes which could then be filtered in another filter.
Example
This example implements the BytesDocumentFilter interface. We are required to implement canFilter(), used to check if the filter should be run, as well as filterAsBytesDocument(), which contains the logic for the filter.
This example also has a simple test method, simpleDecodeTest. The test can be executed by running the main method; see testing Groovy filters.
package com.myfilters;
import static java.nio.charset.StandardCharsets.UTF_8;
import java.util.*;
import org.junit.*;
import org.junit.Test;
import com.funnelback.filter.api.*;
import com.funnelback.filter.api.documents.*;
import com.funnelback.filter.api.filters.*;
import com.funnelback.filter.api.mock.*;
@groovy.util.logging.Log4j2
public class Base64DecodingFilter implements BytesDocumentFilter {

    /**
     * Pre-filter check. No inspection of the document is performed: every
     * document is treated as Base64 encoded, so filtering is always attempted.
     *
     * @param document the document about to be filtered (not inspected here)
     * @param context  the filter context (not used here)
     * @return always {@code PreFilterCheck.ATTEMPT_FILTER}
     */
    @Override
    public PreFilterCheck canFilter(NoContentDocument document, FilterContext context) {
        return PreFilterCheck.ATTEMPT_FILTER;
    }

    /**
     * Base64-decodes the raw content of the document and returns a copy of the
     * document carrying the decoded bytes. The document type and charset of
     * the incoming document are carried over unchanged.
     *
     * @param document the document whose bytes are Base64 decoded
     * @param context  the filter context (not used here)
     * @return a result wrapping the copy of the document with decoded content
     */
    @Override
    public FilterResult filterAsBytesDocument(BytesDocument document, FilterContext context) {
        // Pull out the raw bytes first, then run them through the standard decoder.
        byte[] encodedBytes = document.getCopyOfContents();
        byte[] decodedBytes = Base64.getDecoder().decode(encodedBytes);

        log.debug("Decoding resulted in a " + decodedBytes.length + " byte content for: " + document.getURI());

        // Only the content is swapped; type and charset come from the input document.
        BytesDocument decodedDocument = document.cloneWithContent(
            document.getDocumentType(),
            document.getCharset(),
            decodedBytes);

        return FilterResult.of(decodedDocument);
    }

    /*
     * Test methods for the filter live in the nested class below.
     */
    public static class FilterTest {

        /**
         * Feeds a Base64 encoded document through the filter and checks the
         * content of the first resulting document against the expected text.
         */
        @Test
        public void simpleDecodeTest() throws Exception {
            // Input document whose content is the Base64 encoded bytes.
            BytesDocument encodedDoc = MockDocuments.mockEmptyByteDoc().cloneWithContent(
                DocumentType.MIME_UNKNOWN,
                Optional.empty(),
                "SGVsbG8gRnVubmVsYmFjaw==".getBytes(UTF_8));

            // Run the filter against an empty mock context.
            // NOTE(review): filter(...) is not declared in this class; presumably
            // it is supplied by BytesDocumentFilter - confirm against the API.
            Base64DecodingFilter filterUnderTest = new Base64DecodingFilter();
            FilterResult outcome = filterUnderTest.filter(encodedDoc, MockFilterContext.getEmptyContext());

            // The test reads the first document of the result.
            BytesDocument decodedDoc = (BytesDocument) outcome.getFilteredDocuments().get(0);

            // Interpret the decoded bytes as UTF-8 text and compare.
            String actualText = new String(decodedDoc.getCopyOfContents(), UTF_8);
            Assert.assertEquals(
                "The content was not correctly decoded",
                "Hello Funnelback",
                actualText);
        }
    }

    /**
     * Entry point: hands the nested test class to the filter test runner.
     */
    public static void main(String[] args) throws Exception {
        FilterTestRunner.runTests(FilterTest.class);
    }
}