package com.lucidworks.analysis1;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import org.apache.lucene.util.AttributeImpl;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.util.CharArraySet;
import org.apache.lucene.analysis.util.CharArrayMap;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * Performs "auto phrasing" on a token stream. Auto phrases refer to sequences of tokens that are meant to describe a
 * single thing and should be searched for as such. When these phrases are detected in the token stream, a single token
 * representing the phrase is emitted rather than the individual tokens that make up the phrase. The filter supports
 * overlapping phrases.
 *
 * The auto-phrasing filter can be combined with a synonym filter to handle cases in which prefix or suffix terms in a
 * phrase are synonymous with the phrase, but where other parts of the phrase are not.
 *
 * When {@code emitSingleTokens} is set, the individual tokens are emitted as well as the auto-phrases.
 */
public class AutoPhrasingTokenFilter extends TokenFilter {

    private static final Logger Log = LoggerFactory.getLogger(AutoPhrasingTokenFilter.class);

    // The auto-phrase character strings. Looked up by token in incrementToken(); the value
    // returned by get(token, ...) is used there as the set of candidate phrases for that token.
    // NOTE(review): declared as a raw type although its values are used as CharArraySet without
    // a cast further down - type parameters may have been lost; confirm against the build.
    private CharArrayMap phraseMap;

    // Candidate phrases still being checked against the tokens seen since the
    // first term match; null or empty when no phrase is in progress.
    private CharArraySet currentSetToCheck = null;

    // The current phrase that has been seen in the token stream
    // since the first term match was encountered
    private StringBuffer currentPhrase = new StringBuffer();

    // Queue to allow old tokens that ultimately did not match to be
    // emitted before new tokens are emitted so that the filter can
    // work 'transparently'
    // NOTE(review): raw ArrayList, yet remove(0) is assigned to Token without a cast below -
    // type parameters may have been lost; confirm.
    private ArrayList unusedTokens = new ArrayList();

    // If true - emit single tokens as well as auto-phrases
    private boolean emitSingleTokens;

    // Token parked for emission at the start of the next incrementToken() call.
    private char[] lastToken = null;

    // Read in the end-of-input handling of incrementToken(); it is only ever set to null in
    // the code shown here (presumably assigned by emit(...) - confirm).
    private char[] lastEmitted = null;

    // A phrase that has already matched while longer candidates remain in currentSetToCheck;
    // emitted when the input runs out.
    private char[] lastValid = null;

    // Optional replacement character configured through setReplaceWhitespaceWith(...).
    private Character replaceWhitespaceWith = null;

    // Zeroed in reset() and decremented after a completed phrase is emitted.
    private int positionIncr = 0;

    /**
     * Creates a filter over {@code input} that recognises the phrases in {@code phraseSet}.
     *
     * @param input            the upstream token stream, passed to the superclass
     * @param phraseSet        the auto-phrases; converted with {@code convertPhraseSet} into {@code phraseMap}
     * @param emitSingleTokens if true, single tokens are emitted as well as auto-phrases
     */
    public AutoPhrasingTokenFilter(TokenStream input, CharArraySet phraseSet, boolean emitSingleTokens) {
        super(input);
        // Convert to CharArrayMap by iterating the char[] strings and
        // putting them into the CharArrayMap with Integer of the number
        // of tokens in the map: need this to determine when a phrase match is completed.
        // NOTE(review): incrementToken() uses the map values as phrase sets (they are passed to
        // CharArraySet.copy), not as Integers - the comment above may be stale; confirm against
        // convertPhraseSet.
        this.phraseMap = convertPhraseSet(phraseSet);
        this.emitSingleTokens = emitSingleTokens;
    }

    /**
     * Creates a filter over {@code input} without configuring any phrases.
     *
     * NOTE(review): this constructor leaves {@code phraseMap} unassigned, while incrementToken()
     * calls {@code phraseMap.keySet()}; unless the field is assigned elsewhere, that call would
     * fail with a NullPointerException - confirm how subclasses are expected to use this.
     *
     * @param input the upstream token stream, passed to the superclass
     */
    protected AutoPhrasingTokenFilter(TokenStream input) {
        super(input);
    }

    /**
     * Stores the character to be used in place of whitespace.
     *
     * @param replaceWhitespaceWith the replacement character, or null (the field's initial value)
     */
    public void setReplaceWhitespaceWith(Character replaceWhitespaceWith) {
        this.replaceWhitespaceWith = replaceWhitespaceWith;
    }

    /**
     * Resets the superclass and clears the per-stream matching state: the candidate set, the
     * phrase buffer, the parked token, the last-emitted marker, the put-back queue and the
     * position increment counter.
     *
     * NOTE(review): {@code lastValid} is not cleared here although {@code lastToken} and
     * {@code lastEmitted} are; a phrase still held in it would survive into the next use of
     * the filter - confirm whether that is intended.
     *
     * @throws IOException declared by the overridden method
     */
    @Override
    public void reset() throws IOException {
        super.reset();
        currentSetToCheck = null;
        currentPhrase.setLength(0);
        lastToken = null;
        lastEmitted = null;
        unusedTokens.clear();
        positionIncr = 0;
    }

    /**
     * Produces the next output token. Order of work, as implemented below:
     * <ol>
     *   <li>when single tokens are not emitted, drain queued put-back tokens, one per call;</li>
     *   <li>emit a parked {@code lastToken}, if any;</li>
     *   <li>fetch the next token with {@code nextToken()}; a null result is handled as end of
     *       input: pending phrases and put-back tokens are flushed one per call, after which
     *       false is returned;</li>
     *   <li>otherwise the token either starts a new candidate phrase (it is a key of
     *       {@code phraseMap}), is emitted unchanged, or is appended to the candidate phrase
     *       in progress.</li>
     * </ol>
     *
     * @return true when a token has been made available, false when nothing is left
     * @throws IOException declared by the overridden method
     */
    @Override
    public final boolean incrementToken() throws IOException {
        if (!emitSingleTokens && unusedTokens.size() > 0) {
            Log.debug("emitting unused phrases 1");
            // emit these until the queue is empty before emitting any new stuff
            Token aToken = unusedTokens.remove(0);
            emit(aToken);
            return true;
        }

        if (lastToken != null) {
            Log.debug("emit lastToken");
            emit(lastToken);
            lastToken = null;
            return true;
        }

        char[] nextToken = nextToken();
        // if (nextToken != null) System.out.println( "nextToken: " + new String( nextToken ));
        if (nextToken == null) {
            // No further token: flush whatever is still pending, one item per call.
            if (lastValid != null) {
                Log.debug("emit lastValid");
                emit(lastValid);
                lastValid = null;
                return true;
            }

            if (emitSingleTokens && currentSetToCheck != null && currentSetToCheck.size() > 0) {
                char[] phrase = getFirst(currentSetToCheck);
                char[] lastTok = getCurrentBuffer(new char[0]);
                if (phrase != null && endsWith(lastTok, phrase)) {
                    currentSetToCheck = remove(currentSetToCheck, phrase);
                    Log.debug("emit phrase");
                    emit(phrase);
                    return true;
                }
            } else if (!emitSingleTokens && currentSetToCheck != null && currentSetToCheck.size() > 0) {
                char[] currBuff = getCurrentBuffer(new char[0]);
                if (lastEmitted != null && !equals(fixWhitespace(lastEmitted), currBuff)) {
                    discardCharTokens(currentPhrase, unusedTokens);
                    currentSetToCheck = null;
                    if (unusedTokens.size() > 0) {
                        Token aToken = unusedTokens.remove(0);
                        // don't emit if current phrase not completed and overlaps with lastEmitted
                        if (!endsWith(lastEmitted, currBuff)) {
                            Log.debug("emitting putback token 2");
                            emit(aToken);
                            return true;
                        }
                    }
                }
            }

            if (lastEmitted == null && (currentPhrase != null && currentPhrase.length() > 0)) {
                char[] lastTok = getCurrentBuffer(new char[0]);
                if (currentSetToCheck.contains(lastTok, 0, lastTok.length)) {
                    Log.debug("emit lastTok ");
                    emit(lastTok);
                    currentPhrase.setLength(0);
                    return true;
                } else if (!emitSingleTokens) {
                    discardCharTokens(currentPhrase, unusedTokens);
                    currentSetToCheck = null;
                    currentPhrase.setLength(0);
                    if (unusedTokens.size() > 0) {
                        Token aToken = unusedTokens.remove(0);
                        Log.debug("emitting putback token 3");
                        emit(aToken);
                        return true;
                    }
                }
            }
            return false;
        }

        // if emitSingleToken, set lastToken = nextToken
        if (emitSingleTokens) {
            lastToken = nextToken;
        }

        if (currentSetToCheck == null || currentSetToCheck.size() == 0) {
            // No phrase in progress: the token either starts one or is emitted as is.
            // Log.info("Checking for phrase start on '" + new String(nextToken) + "'");
            if (phraseMap.keySet().contains(nextToken, 0, nextToken.length)) {
                // get the phrase set for this token, add it to currentSetTocheck
                /* memory-related change: work on a copy of the set (presumably so that later
                   removals do not touch the set stored in phraseMap - confirm) */
                // currentSetToCheck = phraseMap.get(nextToken, 0, nextToken.length );
                currentSetToCheck = CharArraySet.copy(phraseMap.get(nextToken, 0, nextToken.length));
                if (currentPhrase == null)
                    currentPhrase = new StringBuffer();
                else
                    currentPhrase.setLength(0);
                currentPhrase.append(nextToken);
                // Nothing emitted yet - recurse to process the following token.
                return incrementToken();
            } else {
                Log.debug("emit nextToken");
                emit(nextToken);
                // clear lastToken
                lastToken = null;
                return true;
            }
        } else {
            // add token to the current string buffer.
            char[] currentBuffer = getCurrentBuffer(nextToken);
            if (currentSetToCheck.contains(currentBuffer, 0, currentBuffer.length)) {
                // if its the only one valid, emit it
                // if there is a longer one, wait to see if it will be matched
                // if the longer one breaks on the next token, emit this one...
                // emit the current phrase
                currentSetToCheck = remove(currentSetToCheck, currentBuffer);
                if (currentSetToCheck.size() == 0) {
                    emit(currentBuffer);
                    lastValid = null;
                    --positionIncr;
                } else {
                    if (emitSingleTokens) {
                        // Park the matched phrase; it is emitted by the lastToken check on the
                        // next call. Note that emit(...) is not called on this path.
                        lastToken = currentBuffer;
                        return true;
                    }
                    lastValid = currentBuffer;
                }

                if (phraseMap.keySet().contains(nextToken, 0, nextToken.length)) {
                    // get the phrase set for this token, add it to currentPhrasesTocheck
                    /* memory-related change: work on a copy of the set (presumably so that later
                       removals do not touch the set stored in phraseMap - confirm) */
                    // currentSetToCheck = phraseMap.get(nextToken, 0, nextToken.length );
                    currentSetToCheck = CharArraySet.copy(phraseMap.get(nextToken, 0, nextToken.length));
                    if (currentPhrase == null)
                        currentPhrase = new StringBuffer();
                    else
                        currentPhrase.setLength(0);
                    currentPhrase.append(nextToken);
                }

                // A phrase held in lastValid has not been emitted yet, so keep consuming input.
                return (lastValid != null) ? incrementToken() : true;
            }

            if (phraseMap.keySet().contains(nextToken, 0, nextToken.length)) {
                // get the phrase set for this token, add it to currentPhrasesTocheck
                // System.out.println( "starting new phrase with " + new String( nextToken ) );
                // does this add all of the set? if not need iterator loop
                CharArraySet newSet = phraseMap.get(nextToken, 0, nextToken.length);
                Iterator