Skip to content

Filter example: binary to string conversion

Description

Filters can be used to convert from some binary form, such as a PDF, to a String containing HTML. Here we outline an example of being given a binary document and converting it to a String.

Example

In this example we simply convert all JSON documents from raw bytes to a String, assuming the charset is UTF-8. This example implements the BytesDocumentFilter interface. We are required to implement canFilter(), which checks that the document type is JSON to determine whether the filter should run, as well as filterAsBytesDocument(), which contains the logic for the filter.

This example also has two test methods, which can be executed by running the main method; see testing Groovy filters.

package com.myfilters;

import java.nio.charset.StandardCharsets;
import java.util.*;
import org.junit.*;
import org.junit.Test;
import com.funnelback.filter.api.*;
import com.funnelback.filter.api.documents.*;
import com.funnelback.filter.api.filters.*;
import com.funnelback.filter.api.mock.*;

/**
 * Example filter that converts JSON documents from raw bytes into a
 * {@code String}, assuming the bytes are encoded as UTF-8.
 *
 * <p>{@link #canFilter} restricts the filter to documents whose type reports
 * itself as JSON; {@link #filterAsBytesDocument} performs the conversion.
 * The nested {@link FilterTest} class holds the tests, which are executed by
 * running {@link #main}.
 */
@groovy.util.logging.Log4j2
public class JSONUTF8ConverterFilter implements BytesDocumentFilter {

  /**
   * Decides whether this filter should be attempted for the given document.
   *
   * @param document the document to check; only its type and URI are read here
   * @param context the filter context (not used by this check)
   * @return {@code PreFilterCheck.ATTEMPT_FILTER} when the document type is
   *         JSON, otherwise {@code PreFilterCheck.SKIP_FILTER}
   */
  @Override
  public PreFilterCheck canFilter(NoContentDocument document, FilterContext context) {
    // We limit the filter to only run on JSON documents by checking
    // the document type, typically derived from the Content-Type header
    // returned by the web server.
    if (document.getDocumentType().isJSON()) {
      log.debug(document.getURI() + " is JSON so the Filter will be attempted.");
      return PreFilterCheck.ATTEMPT_FILTER;
    }
    return PreFilterCheck.SKIP_FILTER;
  }

  /**
   * Converts the document's bytes to a {@code String} and returns it as a
   * JSON {@code StringDocument}.
   *
   * @param document the binary document whose content is decoded
   * @param context the filter context (not used by this filter)
   * @return a filter result wrapping a {@code StringDocument} built from the
   *         input document, typed as {@code DocumentType.APPLICATION_JSON_TEXT}
   */
  @Override
  public FilterResult filterAsBytesDocument(BytesDocument document, FilterContext context) {
    // Get the document content as a byte array and convert it to a String
    // assuming the bytes are UTF-8. Byte sequences that are not valid UTF-8
    // are replaced by the decoder's replacement character rather than
    // raising an error.
    byte[] documentContentAsBytes = document.getCopyOfContents();
    String contentAsString = new String(documentContentAsBytes, StandardCharsets.UTF_8);

    return FilterResult.of(StringDocument
      .from(document, DocumentType.APPLICATION_JSON_TEXT, contentAsString));
  }

  /*
   * Below are filter test methods
   */
  public static class FilterTest {

    /**
     * A JSON document containing non-ASCII characters must come back as a
     * StringDocument whose content equals the original text.
     */
    @Test
    public void testJSONIsConverted() throws Exception {
      String expected = "{\"accents\": \"é à ê\"}";

      // Create the dummy JSON input document
      BytesDocument inputDoc = MockDocuments.mockByteDoc(
        "http://foo.com/",
        DocumentType.APPLICATION_JSON_TEXT,
        Optional.empty(),
        expected.getBytes(StandardCharsets.UTF_8));

      // Create and run the filter
      FilterResult filterResult = new JSONUTF8ConverterFilter()
        .filter(inputDoc, MockFilterContext.getEmptyContext());

      // Get the resulting filtered document from the filter result
      // (we assume a document will be returned)
      StringDocument filteredDocument = (StringDocument) filterResult
        .getFilteredDocuments().get(0);

      Assert.assertEquals(
        "Content was not correctly converted to a string",
        expected,
        filteredDocument.getContentAsString());
    }

    /**
     * A non-JSON (HTML) document must be skipped by the filter.
     */
    @Test
    public void testFilterOnlyRunsOnJsonDocuments() throws Exception {
      // Create a dummy HTML input document.
      BytesDocument inputDoc = MockDocuments.mockEmptyByteDoc()
        .cloneWithContent(
          DocumentType.MIME_HTML_TEXT,
          Optional.empty(),
          "<html><p>Hello</p></html>".getBytes(StandardCharsets.UTF_8));

      // Create and run the filter.
      FilterResult filterResult = new JSONUTF8ConverterFilter()
        .filter(inputDoc, MockFilterContext.getEmptyContext());

      Assert.assertTrue(
        "Filter should have been skipped as the document was not a JSON document",
        filterResult.isSkipped());
    }
  }

  /**
   * Running the main method will execute the test methods.
   *
   * @param args command-line arguments (ignored)
   */
  public static void main(String[] args) throws Exception {
    FilterTestRunner.runTests(FilterTest.class);
  }

}

See also:

top

Funnelback logo
v15.24.0