Index: org/apache/lucene/queryParser/QueryParser.jj =================================================================== RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/queryParser/QueryParser.jj,v retrieving revision 1.38 diff -u -r1.38 QueryParser.jj --- org/apache/lucene/queryParser/QueryParser.jj 26 Nov 2003 11:00:58 -0000 1.38 +++ org/apache/lucene/queryParser/QueryParser.jj 10 Jan 2004 00:41:29 -0000 @@ -71,6 +71,7 @@ import org.apache.lucene.analysis.*; import org.apache.lucene.document.*; import org.apache.lucene.search.*; +import org.apache.lucene.search.phonetic.*; /** * This class is generated by JavaCC. The only method that clients should need @@ -448,6 +449,23 @@ return new FuzzyQuery(t); } + /** + * Factory method for generating a query (similar to + * ({@link #getWildcardQuery}). Called when parser parses + * an input term token that has the phonetic suffix ($) appended. + * + * @param field Name of the field query will use. + * @param termStr Term token to use for building term for the query + * + * @return Resulting {@link Query} built for the term + * @exception ParseException throw in overridden method to disallow + */ + protected Query getPhoneticQuery(String field, String termStr) throws ParseException + { + Term t = new Term(field, termStr); + return new PhoneticQuery(t); + } + public static void main(String[] args) throws Exception { QueryParser qp = new QueryParser("field", new org.apache.lucene.analysis.SimpleAnalyzer()); @@ -465,9 +483,9 @@ <*> TOKEN : { <#_NUM_CHAR: ["0"-"9"] > | <#_ESCAPED_CHAR: "\\" [ "\\", "+", "-", "!", "(", ")", ":", "^", - "[", "]", "\"", "{", "}", "~", "*", "?" ] > + "[", "]", "\"", "{", "}", "~", "*", "?", "$" ] > | <#_TERM_START_CHAR: ( ~[ " ", "\t", "+", "-", "!", "(", ")", ":", "^", - "[", "]", "\"", "{", "}", "~", "*", "?" ] + "[", "]", "\"", "{", "}", "~", "*", "?", "$" ] | <_ESCAPED_CHAR> ) > | <#_TERM_CHAR: ( <_TERM_START_CHAR> | <_ESCAPED_CHAR> ) > | <#_WHITESPACE: ( " " | "\t" ) > @@ -499,6 +517,7 @@ | | (<_TERM_CHAR>)* > | +| (<_TERM_CHAR>)* "$" > | )+ > | (<_TERM_CHAR>)* "*" > | @@ -611,6 +630,7 @@ boolean wildcard = false; boolean fuzzy = false; boolean rangein = false; + boolean phonetic = false; Query q; } { @@ -619,6 +639,7 @@ term= | term= { prefix=true; } | term= { wildcard=true; } + | term= { phonetic=true; } | term= ) [ { fuzzy=true; } ] @@ -629,6 +650,8 @@ } else if (prefix) { q = getPrefixQuery(field, term.image.substring (0, term.image.length()-1)); + } else if (phonetic) { + q = getPhoneticQuery(field, term.image.substring(0,term.image.length()-1)); } else if (fuzzy) { q = getFuzzyQuery(field, term.image); } else { Index: org/apache/lucene/search/IndexSearcher.java =================================================================== RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/search/IndexSearcher.java,v retrieving revision 1.11 diff -u -r1.11 IndexSearcher.java --- org/apache/lucene/search/IndexSearcher.java 16 Sep 2003 20:06:32 -0000 1.11 +++ org/apache/lucene/search/IndexSearcher.java 10 Jan 2004 00:41:30 -0000 @@ -125,6 +125,10 @@ */ public TopDocs search(Query query, Filter filter, final int nDocs) throws IOException { + + // premptive query rewrite to avoid multiple rewrite during weight calculation + query = rewrite(query); + Scorer scorer = query.weight(this).scorer(reader); if (scorer == null) return new TopDocs(0, new ScoreDoc[0]); @@ -163,8 +167,11 @@ * @param filter if non-null, a bitset used to eliminate some documents * @param results to receive hits */ - public void search(Query query, Filter filter, - final HitCollector results) throws IOException { + public void search(Query query, Filter filter, final HitCollector results) + throws IOException { + + query = rewrite(query); + HitCollector collector = results; if (filter != null) { final BitSet bits = filter.bits(reader); Index: org/apache/lucene/search/PhraseScorer.java =================================================================== RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/search/PhraseScorer.java,v retrieving revision 1.8 diff -u -r1.8 PhraseScorer.java --- org/apache/lucene/search/PhraseScorer.java 11 Sep 2003 01:25:47 -0000 1.8 +++ org/apache/lucene/search/PhraseScorer.java 10 Jan 2004 00:41:30 -0000 @@ -101,7 +101,9 @@ if (freq > 0.0) { float score = similarity.tf(freq) * value; // compute score - score *= Similarity.decodeNorm(norms[first.doc]); // normalize + // check for 'null' norms + if(norms!=null) + score *= Similarity.decodeNorm(norms[first.doc]); // normalize results.collect(first.doc, score); // add to results } last.next(); // resume scanning Index: org/apache/lucene/search/TermScorer.java =================================================================== RCS file: /home/cvspublic/jakarta-lucene/src/java/org/apache/lucene/search/TermScorer.java,v retrieving revision 1.6 diff -u -r1.6 TermScorer.java --- org/apache/lucene/search/TermScorer.java 29 Jan 2003 17:18:55 -0000 1.6 +++ org/apache/lucene/search/TermScorer.java 10 Jan 2004 00:41:31 -0000 @@ -104,7 +104,9 @@ ? scoreCache[f] // cache hit : similarity.tf(f)*weightValue; // cache miss - score *= Similarity.decodeNorm(norms[d]); // normalize for field + // check for 'null' norms + if(norms!=null) + score *= Similarity.decodeNorm(norms[d]); // normalize for field c.collect(d, score); // collect score Index: org/apache/lucene/search/phonetic/DoubleMetaphone.java =================================================================== RCS file: org/apache/lucene/search/phonetic/DoubleMetaphone.java diff -N org/apache/lucene/search/phonetic/DoubleMetaphone.java --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ org/apache/lucene/search/phonetic/DoubleMetaphone.java 10 Jan 2004 00:41:32 -0000 @@ -0,0 +1,965 @@ + +/* + Phonetix: a class-library of phonetic algorithms. + Copyright (C) 2001-2003 Claus Engel + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +package org.apache.lucene.search.phonetic; + +/** + * Encoder implementing the phonetic algorithm "DoubleMetaphone". + * DoubleMetaphone was developed by Lawrence Philips + * and published in the June 2000 issue of the C/C++ Users Journal. + * DoubleMetaphone is an improvement + * of Philips' original Metaphone algorithm. + *

+ * DoubleMetaphone sometimes generates two encodings of a word, + * instead of only one like Soundex and Metaphone. + * The first encoding is usually based on the most commonly heard + * pronounciation of the word in the U.S.A. + * For more information on DoubleMetaphone, please see Philips' article + * article. + *

+ * This implementation is based on the + * original Visual C++ implementation + * by Lawrence Philips with + * slight modifications and bug fixes + * by Kevin Atkinson. + * + * @see Metaphone + * @see Soundex + * @author Claus Engel + */ +public final class DoubleMetaphone extends MetaphoneEncoder +{ + private final static String[] GN_KN_PN_WR_PS = new String[] { "GN", "KN", "PN", "WR", "PS" }; + private final static String[] ACH = new String[] { "ACH" }; + private final static String[] BACHER_MACHER = new String[] { "BACHER", "MACHER" }; + private final static String[] CAESAR = new String[] { "CAESAR" }; + private final static String[] CHIA = new String[] { "CHIA" }; + private final static String[] CH = new String[] { "CH" }; + private final static String[] CHAE = new String[] { "CHAE" }; + private final static String[] HARAC_HARIS_HOR_HYM_HIA_HEM = new String[] { "HARAC", "HARIS", "HOR", "HYM", "HIA", "HEM" }; + private final static String[] CHORE = new String[] { "CHORE" }; + private final static String[] SCH = new String[] { "SCH" }; + private final static String[] VAN__VON__SCH = new String[] { "VAN ", "VON ", "SCH" }; + private final static String[] ORCHES_ARCHIT_ORCHID = new String[] { "ORCHES", "ARCHIT", "ORCHID" }; + private final static String[] T_S = new String[] { "T", "S" }; + private final static String[] A_O = new String[] { "A", "O" }; + private final static String[] A_O_U_E = new String[] { "A", "O", "U", "E" }; + private final static String[] L_R_N_M_B_H_F_V_W__ = new String[] { "L", "R", "N", "M", "B", "H", "F", "V", "W", " " }; + private final static String[] MC = new String[] { "MC" }; + private final static String[] CZ = new String[] { "CZ" }; + private final static String[] WICZ = new String[] { "WICZ" }; + private final static String[] CIA = new String[] { "CIA" }; + private final static String[] CC = new String[] { "CC" }; + private final static String[] I_E_H = new String[] { "I", "E", "H" }; + private final static String[] HU = new String[] { "HU" }; + private final static String[] UCCEE_UCCES = new String[] { "UCCEE", "UCCES" }; + private final static String[] CK_CG_CQ = new String[] { "CK", "CG", "CQ" }; + private final static String[] CI_CE_CY = new String[] { "CI", "CE", "CY" }; + private final static String[] CIO_CIE_CIA = new String[] { "CIO", "CIE", "CIA" }; + private final static String[] _C__Q__G = new String[] { " C", " Q", " G" }; + private final static String[] C_K_Q = new String[] { "C", "K", "Q" }; + private final static String[] CE_CI = new String[] { "CE", "CI" }; + private final static String[] DG = new String[] { "DG" }; + private final static String[] I_E_Y = new String[] { "I", "E", "Y" }; + private final static String[] DT_DD = new String[] { "DT", "DD" }; + private final static String[] B_H_D = new String[] { "B", "H", "D" }; + private final static String[] B_H = new String[] { "B", "H" }; + private final static String[] C_G_L_R_T = new String[] { "C", "G", "L", "R", "T" }; + private final static String[] EY = new String[] { "EY" }; + private final static String[] LI = new String[] { "LI" }; + private final static String[] Y_ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER= new String[] { "Y", "ES", "EP", "EB", "EL", "EY", "IB", "IL", "IN", "IE", "EI", "ER" }; + private final static String[] Y_ER = new String[] { "Y", "ER" }; + private final static String[] DANGER_RANGER_MANGER = new String[] { "DANGER", "RANGER", "MANGER" }; + private final static String[] E_I = new String[] { "E", "I" }; + private final static String[] RGY_OGY = new String[] { "RGY", "OGY" }; + private final static String[] E_I_Y = new String[] { "E", "I", "Y" }; + private final static String[] AGGI_OGGI = new String[] { "AGGI", "OGGI" }; + private final static String[] ET = new String[] { "ET" }; + private final static String[] IER_ = new String[] { "IER " }; + private final static String[] JOSE = new String[] { "JOSE" }; + private final static String[] SAN_ = new String[] { "SAN " }; + private final static String[] L_T_K_S_N_M_B_Z = new String[] { "L", "T", "K", "S", "N", "M", "B", "Z" }; + private final static String[] S_K_L = new String[] { "S", "K", "L" }; + private final static String[] ILLO_ILLA_ALLE = new String[] { "ILLO", "ILLA", "ALLE" }; + private final static String[] AS_OS = new String[] { "AS", "OS" }; + private final static String[] ALLE = new String[] { "ALLE" }; + private final static String[] UMB = new String[] { "UMB" }; + private final static String[] P_B = new String[] { "P", "B" }; + private final static String[] IE = new String[] { "IE" }; + private final static String[] IER = new String[] { "IER" }; + private final static String[] ER = new String[] { "ER" }; + private final static String[] ME_MA = new String[] { "ME", "MA" }; + private final static String[] ISL_YSL = new String[] { "ISL", "YSL" }; + private final static String[] SUGAR = new String[] { "SUGAR" }; + private final static String[] SH = new String[] { "SH" }; + private final static String[] HEIM_HOEK_HOLM_HOLZ = new String[] { "HEIM", "HOEK", "HOLM", "HOLZ" }; + private final static String[] SIO_SIA = new String[] { "SIO", "SIA" }; + private final static String[] SIAN = new String[] { "SIAN" }; + private final static String[] M_N_L_W = new String[] { "M", "N", "L", "W" }; + private final static String[] Z = new String[] { "Z" }; + private final static String[] SC = new String[] { "SC" }; + private final static String[] OO_ER_EN_UY_ED_EM = new String[] { "OO", "ER", "EN", "UY", "ED", "EM" }; + private final static String[] ER_EN = new String[] { "ER", "EN" }; + private final static String[] AI_OI = new String[] { "AI", "OI" }; + private final static String[] S_Z = new String[] { "S", "Z" }; + private final static String[] TION = new String[] { "TION" }; + private final static String[] TIA_TCH = new String[] { "TIA", "TCH" }; + private final static String[] TH_TTH = new String[] { "TH", "TTH" }; + private final static String[] OM_AM = new String[] { "OM", "AM" }; + private final static String[] T_D = new String[] { "T", "D" }; + private final static String[] WR = new String[] { "WR" }; + private final static String[] WH = new String[] { "WH" }; + private final static String[] EWSKI_EWSKY_OWSKI_OWSKY = new String[] { "EWSKI", "EWSKY", "OWSKI", "OWSKY" }; + private final static String[] WICZ_WITZ = new String[] { "WICZ", "WITZ" }; + private final static String[] IAU_EAU = new String[] { "IAU", "EAU" }; + private final static String[] AU_OU = new String[] { "AU", "OU" }; + private final static String[] C_X = new String[] { "C", "X" }; + private final static String[] ZO_ZI_ZA = new String[] { "ZO", "ZI", "ZA" }; + + private StringBuffer primaryBuffer; + private StringBuffer secondaryBuffer; + private boolean hasAlternate; + + /** + * Constructs a DoubleMetaphone encoder which generates keys of given + * maximal length. + * @param maxLength the maximal length of the generated keys. If negative, + * the lengths of the keys returned are only limited + * by the lengths of the words to encode. + */ + public DoubleMetaphone (int maxLength) + { + super (maxLength); + } + + /** + * Constructs a DoubleMetaphone encoder which generates keys with + * maximal length 4. + */ + public DoubleMetaphone () + { + super (4); + } + + /** + * Returns a String identifying the algorithm. + */ + public String toString() + { + return "DoubleMetaphone_"+maxLength; + } + + private void add (final String main) + { + primaryBuffer.append(main); + secondaryBuffer.append(main); + } + + private void add (final String main, final String alternate) + { + primaryBuffer.append(main); + if (alternate.length() > 0) + { + hasAlternate = true; + if (!alternate.equals(" ")) + secondaryBuffer.append(alternate); + } + else + { + if (main.length() > 0 && !main.equals(" ")) + secondaryBuffer.append(main); + } + } + + private static boolean isSlavoGermanic (final String string) + { + return (string.indexOf('W') >= 0) || + (string.indexOf('K') >= 0) || + (string.indexOf("CZ") >= 0) || + (string.indexOf("WITZ") >= 0); + } + + /** + * Returns an encoding of the given word, that is based on the most + * commonly heard pronounciation of the word in the U.S.A. + * @param word the word to encode. + * @return the encoding of the word. This is never null. + */ + public String generateKey (final String word) + { + final String[] result = generateKeys(word); + return (result.length > 0) ? result[0] : ""; + } + + /** + * Returns the encodings of the given word. The first is based on the most + * commonly heard pronounciation of the word in the U.S.A. + * @param word the word to encode. + * @return an array of the encodings of the word. + * This is never null. + */ + public synchronized String[] generateKeys (String word) + { + if (word == null || word.length() == 0) + return EMPTY_KEYS; + + primaryBuffer = new StringBuffer(word.length()); + secondaryBuffer = new StringBuffer(word.length()); + hasAlternate = false; + + word = word.toUpperCase(); + + final int length = word.length(); + final int last = length -1; + + final boolean isSlavoGermanic = isSlavoGermanic(word); + + int n = 0; + + // skip these when at start of word + if (match(word,0,GN_KN_PN_WR_PS)) + n += 1; + + // initial 'X' is pronounced 'Z' e.g. 'Xavier' + if (match(word,0,'X')) + { + add("S"); //'Z' maps to 'S' + n += 1; + } + + while (n < length && (maxLength < 0 || (primaryBuffer.length() < maxLength && secondaryBuffer.length() < maxLength))) + { + switch (word.charAt(n)) + { + case 'A': + case 'E': + case 'I': + case 'O': + case 'U': + case 'Y': + if (n == 0) + //all init vowels now map to 'A' + add("A"); + n += 1; + break; + + case 'B': + //"-mb", e.g", "dumb", already skipped over... + add("P"); + n += match(word,n+1,'B') ? 2 : 1; + break; + + case 'Ç': + add("S"); + n += 1; + break; + + case 'C': + // various germanic + if ((n > 1) && + !isVowel(word,n-2) && + match(word,n-1,ACH) && + !match(word,n+2,'I') && + (!match(word,n+2,'E') || match(word,n-2,BACHER_MACHER))) + { + add("K"); + n += 2; + break; + } + + // special case 'caesar' + if ((n == 0) && match(word,n,CAESAR)) + { + add("S"); + n += 2; + break; + } + + // italian 'chianti' + if (match(word,n,CHIA)) + { + add("K"); + n += 2; + break; + } + + if (match(word,n,CH)) + { + // find 'michael' + if ((n > 0) && match(word,n,CHAE)) + { + add("K", "X"); + n += 2; + break; + } + + // greek roots e.g. 'chemistry', 'chorus' + if ((n == 0) && + match(word,n+1,HARAC_HARIS_HOR_HYM_HIA_HEM) && + !match(word,0,CHORE)) + { + add("K"); + n += 2; + break; + } + + // germanic, greek, or otherwise 'ch' for 'kh' sound + if (match(word,0,VAN__VON__SCH) + // 'architect' but not 'arch', 'orchestra', 'orchid' + || match(word,n-2,ORCHES_ARCHIT_ORCHID) + || match(word,n+2,T_S) + || ( ((n==0) || match(word,n-1,A_O_U_E)) && + //e.g., 'wachtler', 'wechsler', but not 'tichner' + match(word,n+2,L_R_N_M_B_H_F_V_W__) )) + { + add("K"); + } + else + { + if (n > 0) + { + if (match(word,0,MC)) + // e.g., "McHugh" + add("K"); + else + add("X", "K"); + } + else + add("X"); + } + n += 2; + break; + } + + // e.g., 'czerny' + if (match(word,n,CZ) && !match(word,n-2,WICZ)) + { + add("S", "X"); + n += 2; + break; + } + + // e.g., 'focaccia' + if (match(word,n+1,CIA)) + { + add("X"); + n += 3; + break; + } + + // double 'C', but not if e.g. 'McClellan' + if (match(word,n,CC) && !((n == 1) && match(word,0,'M'))) + { + // 'bellocchio' but not 'bacchus' + if (match(word,n+2,I_E_H) && !match(word,n+2,HU)) + { + // 'accident', 'accede', 'succeed' + if (((n == 1) && match(word,n-1,'A')) + || match(word,n-1,UCCEE_UCCES)) + add("KS"); + // 'bacci', 'bertucci', other italian + else + add("X"); + n += 3; + break; + } + else + { + // Pierce's rule + add("K"); + n += 2; + break; + } + } + + if (match(word,n,CK_CG_CQ)) + { + add("K"); + n += 2; + break; + } + + if (match(word,n,CI_CE_CY)) + { + // italian vs. english + if (match(word,n,CIO_CIE_CIA)) + add("S", "X"); + else + add("S"); + n += 2; + break; + } + + // else + add("K"); + + // name sent in 'mac caffrey', 'mac gregor' + if (match(word,n+1,_C__Q__G)) + n += 3; + else + n += (match(word,n+1,C_K_Q) && !match(word,n+1,CE_CI)) ? 2 : 1; + break; + + case 'D': + if (match(word,n,DG)) + { + if (match(word,n+2,I_E_Y)) + { + // e.g. 'edge' + add("J"); + n += 3; + break; + } + else + { + // e.g. 'edgar' + add("TK"); + n += 2; + break; + } + } + + if (match(word,n,DT_DD)) + { + add("T"); + n += 2; + break; + } + + // else + add("T"); + n += 1; + break; + + case 'F': + n += match(word,n+1,'F') ? 2 : 1; + add("F"); + break; + + case 'G': + if (match(word,n+1,'H')) + { + if ((n > 0) && !isVowel(word,n-1)) + { + add("K"); + n += 2; + break; + } + + if (n < 3) + { + // 'ghislane', 'ghiradelli' + if (n == 0) + { + if (match(word,n+2,'I')) + add("J"); + else + add("K"); + n += 2; + break; + } + } + + // Parker's rule (with some further refinements) - e.g., 'hugh' + if(((n > 1) && match(word,n-2,B_H_D)) + // e.g., 'bough' + || ((n > 2) && match(word,n-3,B_H_D)) + // e.g., 'broughton' + || ((n > 3) && match(word,n-4,B_H)) ) + { + n += 2; + break; + } + else + { + // e.g., 'laugh', 'McLaughlin', 'cough', 'gough', 'rough', 'tough' + if ((n > 2) && match(word,n-1,'U') && match(word,n-3,C_G_L_R_T)) + { + add("F"); + } + else if ((n > 0) && !match(word,n-1,'I')) + { + add("K"); + } + + n += 2; + break; + } + } + + if (match(word,n+1,'N')) + { + if ((n == 1) && isVowel(word,0) && !isSlavoGermanic) + { + add("KN", "N"); + } + else + { + // not e.g. 'cagney' + if (!match(word,n+2,EY) && !match(word,n+1,'Y') && !isSlavoGermanic) + { + add("N", "KN"); + } + else + { + add("KN"); + } + } + n += 2; + break; + } + + // 'tagliaro' + if (match(word,n+1,LI) && !isSlavoGermanic) + { + add("KL", "L"); + n += 2; + break; + } + + // -ges-,-gep-,-gel-, -gie- at beginning + if ((n == 0) && match(word,n+1,Y_ES_EP_EB_EL_EY_IB_IL_IN_IE_EI_ER)) + { + add("K", "J"); + n += 2; + break; + } + + // -ger-, -gy- + if (match(word,n+1,Y_ER) && + !match(word,0,DANGER_RANGER_MANGER) && + !match(word,n-1,E_I) && + !match(word,n-1,RGY_OGY)) + { + add("K", "J"); + n += 2; + break; + } + + // italian e.g, 'biaggi' + if (match(word,n+1,E_I_Y) || match(word,n-1,AGGI_OGGI)) + { + // obvious germanic + if (match(word,0,VAN__VON__SCH) || match(word,n+1,ET)) + { + add("K"); + } + else + { + // always soft if french ending + if (match(word,n+1,IER)) + add("J"); + else + add("J", "K"); + } + n += 2; + break; + } + + add("K"); + n += match(word,n+1,'G') ? 2 : 1; + break; + + case 'H': + // only keep if first & before vowel or btw. 2 vowels + if (((n == 0) || isVowel(word,n-1)) && isVowel(word,n+1)) + { + add("H"); + n += 2; + } + else + { + //also takes care of 'HH' + n += 1; + } + break; + + case 'J': + // obvious spanish, 'jose', 'san jacinto' + if (match(word,n,JOSE) || match(word,0,SAN_)) + { + if (((n == 0) && match(word,n+4,' ')) || match(word,0,SAN_)) + { + add("H"); + } + else + { + add("J", "H"); + } + n += 1; + break; + } + + if ((n == 0) && !match(word,n,JOSE)) + { + add("J", "A");//Yankelovich/Jankelowicz + } + else + { + // spanish pron. of e.g. 'bajador' + if (isVowel(word,n-1) && !isSlavoGermanic && match(word,n+1,A_O)) + { + add("J", "H"); + } + else + { + if (n == last) + { + add("J", " "); + } + else + { + if (!match(word,n+1,L_T_K_S_N_M_B_Z) && + !match(word,n-1,S_K_L)) + add("J"); + } + } + } + + n += match(word,n+1,'J') ? 2 : 1; + break; + + case 'K': + n += match(word,n+1,'K') ? 2 : 1; + add("K"); + break; + + case 'L': + if (match(word,n+1,'L')) + { + // spanish e.g. 'cabrillo', 'gallegos' + if (((n == length-3) && match(word,n-1,ILLO_ILLA_ALLE)) + || ( (match(word,last-1,AS_OS) || match(word,last,A_O)) + && match(word,n-1,ALLE) )) + { + add("L", " "); + n += 2; + break; + } + n += 2; + } + else + { + n += 1; + } + add("L"); + break; + + case 'M': + if ((match(word,n-1,UMB) && ((n+1 == last) || match(word,n+2,ER))) + //'dumb','thumb' + || match(word,n+1,'M')) + { + n += 2; + } + else + { + n += 1; + } + add("M"); + break; + + case 'N': + n += match(word,n+1,'N') ? 2 : 1; + add("N"); + break; + + case 'Ñ': + n += 1; + add("N"); + break; + + case 'P': + if (match(word,n+1,'H')) + { + add("F"); + n += 2; + break; + } + + // also account for "campbell", "raspberry" + n += match(word,n+1,P_B) ? 2 : 1; + add("P"); + break; + + case 'Q': + n += match(word,n+1,'Q') ? 2 : 1; + add("K"); + break; + + case 'R': + // french e.g. 'rogier', but exclude 'hochmeier' + if ((n == last) && + !isSlavoGermanic && + match(word,n-2,IE) && + !match(word,n-4,ME_MA)) + { + add("", "R"); + } + else + { + add("R"); + } + + n += match(word,n+1,'R') ? 2 : 1; + break; + + case 'S': + // special cases 'island', 'isle', 'carlisle', 'carlysle' + if (match(word,n-1,ISL_YSL)) + { + n += 1; + break; + } + + // special case 'sugar-' + if ((n == 0) && match(word,n,SUGAR)) + { + add("X", "S"); + n += 1; + break; + } + + if (match(word,n,SH)) + { + // germanic + if (match(word,n+1,HEIM_HOEK_HOLM_HOLZ)) + add("S"); + else + add("X"); + n += 2; + break; + } + + // italian & armenian + if (match(word,n,SIO_SIA) || match(word,n,SIAN)) + { + if (!isSlavoGermanic) + add("S", "X"); + else + add("S"); + n += 3; + break; + } + + // german & anglicisations, e.g. 'smith' match 'schmidt', 'snider' match 'schneider' + // also, -sz- in slavic language altho in hungarian it is pronounced 's' + if (((n == 0) && match(word,n+1,M_N_L_W)) || match(word,n+1,'Z')) + { + add("S", "X"); + n += match(word,n+1,'Z') ? 2 : 1; + break; + } + + if (match(word,n,SC)) + { + // Schlesinger's rule + if (match(word,n+2,'H')) + { + // dutch origin, e.g. 'school', 'schooner' + if (match(word,n+3,OO_ER_EN_UY_ED_EM)) + { + // 'schermerhorn', 'schenker' + if (match(word,n+3,ER_EN)) + add("X", "SK"); + else + add("SK"); + n += 3; + break; + } + else + { + if((n == 0) && !isVowel(word,3) && !match(word,3,'W')) + add("X", "S"); + else + add("X"); + n += 3; + break; + } + } + + if (match(word,n+2,I_E_Y)) + add("S"); + else + add("SK"); + n += 3; + break; + } + + // french e.g. 'resnais', 'artois' + if ((n == last) && match(word,n-2,AI_OI)) + add("", "S"); + else + add("S"); + + n += match(word,n+1,S_Z) ? 2 : 1; + break; + + case 'T': + if (match(word,n,TION)) + { + add("X"); + n += 3; + break; + } + + if (match(word,n,TIA_TCH)) + { + add("X"); + n += 3; + break; + } + + if (match(word,n,TH_TTH)) + { + // special case 'thomas', 'thames' or germanic + if (match(word,n+2,OM_AM) || match(word,0,VAN__VON__SCH)) + add("T"); + else + add("0", "T"); + n += 2; + break; + } + + n += match(word,n+1,T_D) ? 2 : 1; + add("T"); + break; + + case 'V': + n += match(word,n+1,'V') ? 2 : 1; + add("F"); + break; + + case 'W': + // can also be in middle of word + if (match(word,n,WR)) + { + add("R"); + n += 2; + break; + } + + if ((n == 0) && (isVowel(word,n+1) || match(word,n,WH))) + { + // Wasserman should match Vasserman + if (isVowel(word,n+1)) + add("A", "F"); + else + //need Uomo to match Womo + add("A"); + } + + // Arnow should match Arnoff + if (((n == last) && isVowel(word,n-1)) + || match(word,n-1,EWSKI_EWSKY_OWSKI_OWSKY) + || match(word,0,SCH)) + { + add("", "F"); + n += 1; + break; + } + + // polish e.g. 'filipowicz' + if (match(word,n,WICZ_WITZ)) + { + add("TS", "FX"); + n += 4; + break; + } + + // else skip it + n += 1; + break; + + case 'X': + // french e.g. breaux + if (!((n == last) && (match(word,n-3,IAU_EAU) || match(word,n-2,AU_OU)))) + add("KS"); + + n += match(word,n+1,C_X) ? 2 : 1; + break; + + case 'Z': + // chinese pinyin e.g. 'zhao' + if (match(word,n+1,'H')) + { + add("J"); + n += 2; + break; + } + else + { + if (match(word,n+1,ZO_ZI_ZA) + || (isSlavoGermanic && (n > 0) && !match(word,n-1,'T'))) + { + add("S", "TS"); + } + else + add("S"); + } + + n += match(word,n+1,'Z') ? 2 : 1; + break; + + default: + n += 1; + } + } + + if (maxLength < 0) + { + // result with unlimited length + if (hasAlternate) + return new String[] { primaryBuffer.toString(), secondaryBuffer.toString() }; + else + return new String[] { primaryBuffer.toString() }; + } + else + { + // limit the length of the resulting strings + final int primaryLength = Math.min(maxLength, primaryBuffer.length()); + if (hasAlternate) + { + final int secondaryLength = Math.min(maxLength, secondaryBuffer.length()); + return new String[] { primaryBuffer.toString().substring(0,primaryLength), + secondaryBuffer.toString().substring(0,secondaryLength) }; + } + else + return new String[] { primaryBuffer.toString().substring(0,primaryLength) }; + } + } + + /** + * Test of algorithm with default constructed encoder. + * The encoded arguments are printed to System.out. + */ + public static void main (String[] argv) + { + final DoubleMetaphone generator = new DoubleMetaphone(); + + for (int n = 0; n < argv.length; n++) + { + String[] keys = generator.generateKeys(argv[n]); + for (int m=0; m < keys.length; m++) + System.out.print(keys[m] + " "); + System.out.println(); + } + } +} + Index: org/apache/lucene/search/phonetic/Metaphone.java =================================================================== RCS file: org/apache/lucene/search/phonetic/Metaphone.java diff -N org/apache/lucene/search/phonetic/Metaphone.java --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ org/apache/lucene/search/phonetic/Metaphone.java 10 Jan 2004 00:41:32 -0000 @@ -0,0 +1,300 @@ + +/* + Phonetix: a class-library of phonetic algorithms. + Copyright (C) 2001-2003 Claus Engel + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +package org.apache.lucene.search.phonetic; + +/** + * Encoder implementing the phonetic algorithm "Metaphone". + * Metaphone was originally developed by Lawrence Philips. + * This implementation is based on his original BASIC implementation. + * + * @see DoubleMetaphone + * @see Soundex + * @author Claus Engel + */ +public final class Metaphone extends MetaphoneEncoder +{ + private final static String[] GN_KN_PN_WR_AE = new String[] { "GN", "KN", "PN", "WR", "AE" }; + private final static String[] WH = new String[] { "WH" }; + private final static String[] IA = new String[] { "IA" }; + private final static String[] SCE_SCI_SCY = new String[] { "SCE", "SCI", "SCY" }; + private final static String[] E_I_Y = new String[] { "E", "I", "Y" }; + private final static String[] SCH = new String[] { "SCH" }; + private final static String[] GE_GI_GY = new String[] { "GE", "GI", "GY" }; + private final static String[] NED = new String[] { "NED" }; + private final static String[] DGE_DGI_DGY = new String[] { "DGE", "DGI", "DGY" }; + private final static String[] C_S_P_T_G = new String[] { "C", "S", "P", "T", "G" }; + private final static String[] IO_IA = new String[] { "IO", "IA" }; + private final static String[] CH = new String[] { "CH" }; + + /** + * Constructs a Metaphone encoder which generates keys of given + * maximal length. + * @param maxLength the maximal length of the generated keys. If negative, + * the lengths of the keys returned are only limited + * by the lengths of the words to encode. + */ + public Metaphone (int maxLength) + { + super (maxLength); + } + + /** + * Constructs a Metaphone encoder which generates keys with + * maximal length 4. + */ + public Metaphone () + { + super (4); + } + + /** + * Returns a String identifying the algorithm. + */ + public String toString() + { + return "Metaphone_"+maxLength; + } + + /** + * Returns the encoding of the given word. + * @param word the word to encode. + * @return an array with the encoding of the word. + * This is never null. + */ + public String[] generateKeys (final String word) + { + return (word!=null && word.length()>0) ? new String[] {generateKey(word)} : EMPTY_KEYS; + } + + /** + * Returns the encoding of the given word. + * @param word the word to encode. + * @return the encoding of the word. This is never null. + */ + public String generateKey (String word) + { + if (word == null || word.length() == 0) + return ""; + + final StringBuffer buffer = new StringBuffer(word.length()); + + word = word.toUpperCase(); + + if (match(word,0,GN_KN_PN_WR_AE)) + word = word.substring(1); + + if (match(word,0,'X')) + word = "S" + word.substring(1); + + if (match(word,0,WH)) + word = "W" + word.substring(2); + + final int length = word.length(); + final int last = length -1; + + for (int n=0; n < length && (maxLength < 0 || (buffer.length() < maxLength)); n++) + { + char c = word.charAt(n); + if (c != 'C' && n > 0 && match(word,n-1,c)) + continue; + + switch (c) + { + case 'A': + if (n == 0) + buffer.append('A'); + break; + + case 'B': + if ((n != last) || !match(word,n-1,'M')) + buffer.append('B'); + break; + + case 'C': + if (!match(word,n-1,SCE_SCI_SCY)) + { + if (match(word,n+1,IA)) + buffer.append('X'); + else if (match(word,n+1,E_I_Y)) + buffer.append('S'); + else if (match(word,n-1,SCH)) + buffer.append('K'); + else if (match(word,n+1,'H')) + buffer.append( (n==0 && !isVowel(word,n+2)) ? 'K' : 'X' ); + else + buffer.append('K'); + } + break; + + case 'D': + buffer.append( match(word,n+1,GE_GI_GY) ? 'J' : 'T' ); + break; + + case 'E': + if (n == 0) + buffer.append('E'); + break; + + case 'F': + buffer.append('F'); + break; + + case 'G': + boolean silent = match(word,n+1,'H') && !isVowel(word,n+2); + + if (n > 0) + { + if ( (n+1 == last && match(word,n+1,'N')) || // -GN + (n+3 == last && match(word,n+1,NED)) ) // -GNED + silent = true; + + if (match(word,n-1,DGE_DGI_DGY)) // -DGE- -DGI- -DGY- + silent = true; + } + + if (!silent) + buffer.append( (match(word,n+1,E_I_Y) && !match(word,n-1,'G')) ? 'J' : 'K' ); + break; + + case 'H': + if (n < last && !match(word,n-1,C_S_P_T_G) && isVowel(word,n+1)) + buffer.append('H'); + break; + + case 'I': + if (n == 0) + buffer.append('I'); + break; + + case 'J': + buffer.append('J'); + break; + + case 'K': + if ((n==0) || !match(word,n-1,'C')) + buffer.append('K'); + break; + + case 'L': + buffer.append('L'); + break; + + case 'M': + buffer.append('M'); + break; + + case 'N': + buffer.append('N'); + break; + + case 'O': + if (n == 0) + buffer.append('O'); + break; + + case 'P': + buffer.append( match(word,n+1,'H') ? 'F' : 'P' ); + break; + + case 'Q': + buffer.append('K'); + break; + + case 'R': + buffer.append('R'); + break; + + case 'S': + if (match(word,n+1,IO_IA)) + buffer.append('X'); + else + buffer.append( match(word,n+1,'H') ? 'X' : 'S' ); + break; + + case 'T': + if (match(word,n+1,IO_IA)) + buffer.append('X'); + else if (match(word,n+1,'H')) + { + if (!match(word,n-1,'T')) + buffer.append('0'); + } + else + { + if (!match(word,n+1,CH)) + buffer.append('T'); + } + break; + + case 'U': + if (n == 0) + buffer.append('U'); + break; + + case 'V': + buffer.append('F'); + break; + + case 'W': + if (isVowel(word,n+1)) + buffer.append('W'); + break; + + case 'X': + buffer.append("KS"); + break; + + case 'Y': + if (isVowel(word,n+1)) + buffer.append('Y'); + break; + + case 'Z': + buffer.append('S'); + break; + } + } + + if (maxLength < 0) + { + // result with unlimited length + return buffer.toString(); + } + else + { + // limit the length of the resulting strings + final int bufferLength = Math.min(maxLength, buffer.length()); + return buffer.toString().substring(0,bufferLength); + } + } + + /** + * Test of algorithm with default constructed encoder. + * The encoded arguments are printed to System.out. + */ + public static void main (String[] argv) + { + final Metaphone generator = new Metaphone(); + + for (int n = 0; n < argv.length; n++) + System.out.println(generator.generateKeys(argv[n])[0]); + } +} + Index: org/apache/lucene/search/phonetic/MetaphoneEncoder.java =================================================================== RCS file: org/apache/lucene/search/phonetic/MetaphoneEncoder.java diff -N org/apache/lucene/search/phonetic/MetaphoneEncoder.java --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ org/apache/lucene/search/phonetic/MetaphoneEncoder.java 10 Jan 2004 00:41:32 -0000 @@ -0,0 +1,70 @@ + +/* + Phonetix: a class-library of phonetic algorithms. + Copyright (C) 2001-2003 Claus Engel + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +package org.apache.lucene.search.phonetic; + +/** + * Abstract base class of "metaphone"-encoders. + * + * @author Claus Engel + */ +public abstract class MetaphoneEncoder extends PhoneticEncoder +{ + protected final int maxLength; + + /** + * Constructs a phonetic encoder of type "metaphone". + * @param maxLength the maximal length of the keys to generate by + * this MetaphoneEncoder. If the given + * length is lower than zero, the lengths of the generated + * keys are only limited by the size of the words to encode. + */ + public MetaphoneEncoder (int maxLength) + { + this.maxLength = maxLength; + } + + protected static boolean isVowel (final String string, int pos) + { + if (pos < 0 || string.length() <= pos) + return false; + + final char c = string.charAt(pos); + return c=='A' || c=='E' || c=='I' || c=='O' || c=='U'; + } + + protected static boolean match (final String string, int pos, final String[] strings) + { + if (0 <= pos && pos < string.length()) + { + for (int n=strings.length-1; n >= 0; n--) + { + if (string.regionMatches(pos, strings[n], 0, strings[n].length())) + return true; + } + } + return false; + } + + protected static boolean match (final String string, int pos, char c) + { + return (0 <= pos && pos < string.length()) ? string.charAt(pos) == c : false; + } +} + Index: org/apache/lucene/search/phonetic/PhoneticEncoder.java =================================================================== RCS file: org/apache/lucene/search/phonetic/PhoneticEncoder.java diff -N org/apache/lucene/search/phonetic/PhoneticEncoder.java --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ org/apache/lucene/search/phonetic/PhoneticEncoder.java 10 Jan 2004 00:41:33 -0000 @@ -0,0 +1,55 @@ + +/* + Phonetix: a class-library of phonetic algorithms. + Copyright (C) 2001-2003 Claus Engel + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +package org.apache.lucene.search.phonetic; + +/** + * PhoneticEncoders generate one or more keys for a given word + * using a phonetic algorithm. The goal of each phonetic algorithm is to + * generate the same keys for words which have a similar pronunciation. + * + * @author Claus Engel + */ +public abstract class PhoneticEncoder +{ + /** + * Generates an array of keys. + * + * @param word the word for which the keys have to be generated. + * @return an array of keys. The keys of more importance are found + * at the smaller indices, i.e. the most important key is found + * at index zero. The array is never null, but of length + * zero, if the given word is null or the empty-string. + */ + public abstract String[] generateKeys (String word); + + /** + * Generates a key. If the underlying algorithm creates more + * than one key, the default key, i.e. the most important key, is returned. + * + * @param word the word for which the key has to be generated. + * @return a key. The result is never null, i.e. if + * the given word is null or the empty-string, + * then the empty-string is returned. + */ + public abstract String generateKey (String word); + + public static final String[] EMPTY_KEYS = new String[0]; +} + Index: org/apache/lucene/search/phonetic/PhoneticQuery.java =================================================================== RCS file: org/apache/lucene/search/phonetic/PhoneticQuery.java diff -N org/apache/lucene/search/phonetic/PhoneticQuery.java --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ org/apache/lucene/search/phonetic/PhoneticQuery.java 10 Jan 2004 00:41:33 -0000 @@ -0,0 +1,76 @@ +package org.apache.lucene.search.phonetic; + +/* ==================================================================== + * The Apache Software License, Version 1.1 + * + * Copyright (c) 2001 The Apache Software Foundation. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. The end-user documentation included with the redistribution, + * if any, must include the following acknowledgment: + * "This product includes software developed by the + * Apache Software Foundation (http://www.apache.org/)." + * Alternately, this acknowledgment may appear in the software itself, + * if and wherever such third-party acknowledgments normally appear. + * + * 4. The names "Apache" and "Apache Software Foundation" and + * "Apache Lucene" must not be used to endorse or promote products + * derived from this software without prior written permission. For + * written permission, please contact apache@apache.org. + * + * 5. Products derived from this software may not be called "Apache", + * "Apache Lucene", nor may "Apache" appear in their name, without + * prior written permission of the Apache Software Foundation. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * . + */ + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.FilteredTermEnum; +import org.apache.lucene.search.MultiTermQuery; + +import java.io.IOException; + +/** Implements the phonetic search query */ + +public class PhoneticQuery extends MultiTermQuery { + public PhoneticQuery(Term term) { + super(term); + } + protected FilteredTermEnum getEnum(IndexReader reader) throws IOException { + return new PhoneticTermEnum(reader, getTerm()); + } + public String toString(String field) { + return super.toString(field) + '$'; + } +} Index: org/apache/lucene/search/phonetic/PhoneticTermEnum.java =================================================================== RCS file: org/apache/lucene/search/phonetic/PhoneticTermEnum.java diff -N org/apache/lucene/search/phonetic/PhoneticTermEnum.java --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ org/apache/lucene/search/phonetic/PhoneticTermEnum.java 10 Jan 2004 00:41:34 -0000 @@ -0,0 +1,199 @@ +package org.apache.lucene.search.phonetic; + +/* ==================================================================== + * The Apache Software License, Version 1.1 + * + * Copyright (c) 2001 The Apache Software Foundation. All rights + * reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. The end-user documentation included with the redistribution, + * if any, must include the following acknowledgment: + * "This product includes software developed by the + * Apache Software Foundation (http://www.apache.org/)." + * Alternately, this acknowledgment may appear in the software itself, + * if and wherever such third-party acknowledgments normally appear. + * + * 4. The names "Apache" and "Apache Software Foundation" and + * "Apache Lucene" must not be used to endorse or promote products + * derived from this software without prior written permission. For + * written permission, please contact apache@apache.org. + * + * 5. Products derived from this software may not be called "Apache", + * "Apache Lucene", nor may "Apache" appear in their name, without + * prior written permission of the Apache Software Foundation. + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * ==================================================================== + * + * This software consists of voluntary contributions made by many + * individuals on behalf of the Apache Software Foundation. For more + * information on the Apache Software Foundation, please see + * . + */ + +import java.io.IOException; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.FilteredTermEnum; + +/** + * Subclass of FilteredTermEnum for enumerating all terms that match the + * specified phonetic filter term. + *

+ * Term enumerations are always ordered by Term.compareTo(). Each term in + * the enumeration is greater than all that precede it. + */ + +public class PhoneticTermEnum extends FilteredTermEnum { + + public static final char PHONETIC_CHAR = '$'; + + private boolean searchAll = false; + private boolean endTerm = false; + private String encoding; + private Term startTerm; + private float distance; + + // create a default Metaphone encoder with unlimited key length + // since this is static all readers will use the same encoder, unless the reader + // implements PhoneticTermProvider + + public static PhoneticEncoder defaultEncoder = new Metaphone(-1); + public static float minimumDistance = 1.0f; + + PhoneticEncoder encoder = null; + + /** + * Creates a new PhoneticTermEnum. Passing in a + * {@link org.apache.lucene.index.Term Term} that does not end with + * SOUNDEX_CHAR will cause an exception to be thrown. + */ + public PhoneticTermEnum(IndexReader reader, Term term) throws IOException { + super(reader, term); + + startTerm = term; + + if (reader instanceof PhoneticTermProvider) { + encoder = ((PhoneticTermProvider)reader).phoneticEncoder(); + setEnum(((PhoneticTermProvider) reader).phoneticTerms(term)); + } else { + encoder = defaultEncoder; + // if no provider, search all terms + searchAll = true; + encoding = encoder.generateKey(term.text()); + setEnum(reader.terms(new Term(term.field(), ""))); + } + } + + /** + * only match encoding keys that start with the encoding of the search term + */ + protected final boolean termCompare(Term term) { + if(!startTerm.field().equals(term.field())){ + endTerm = true; + return false; + } + + String key = encoder.generateKey(term.text()); + + if(key.equals(encoding)) + distance = 1.5f; + else if(encoding.startsWith(key)) + distance = 1; + else if(key.startsWith(encoding)) + distance = 1; + else + return false; + + // boost prefixes + if(term.text().startsWith(startTerm.text())) + distance += .5; + + int editdist = editDistance(term.text(),startTerm.text()); + int termlen = term.text().length(); + int startlen = startTerm.text().length(); + + distance = distance - ((float)editdist / (float)Math.min(termlen,startlen)); + +// System.err.println("term = "+term+", key = "+key+", distance = "+distance); + + if(distance. + */ + +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermEnum; + +/** + * IndexReader optimized phonetic search support, for those readers that can support it + */ +public interface PhoneticTermProvider { + /** + * return an enum for all terms that phonetically match a term + * + * @param t the starting term + * @return the TermEnum of matching Terms + */ + TermEnum phoneticTerms(Term t); + + /** + * return the phonetic encoder used by the reader, so differences can be calculated + * + * @return the phonetic encoder + */ + PhoneticEncoder phoneticEncoder(); +} Index: org/apache/lucene/search/phonetic/Soundex.java =================================================================== RCS file: org/apache/lucene/search/phonetic/Soundex.java diff -N org/apache/lucene/search/phonetic/Soundex.java --- /dev/null 1 Jan 1970 00:00:00 -0000 +++ org/apache/lucene/search/phonetic/Soundex.java 10 Jan 2004 00:41:34 -0000 @@ -0,0 +1,188 @@ + +/* + Phonetix: a class-library of phonetic algorithms. + Copyright (C) 2001-2003 Claus Engel + + This library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + This library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with this library; if not, write to the Free Software + Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +*/ +package org.apache.lucene.search.phonetic; + +/** + * Encoder implementing the phonetic algorithm "Soundex". + * Soundex is described in Donald Knuth's + * The Art of Computer Programming, Vol.3. + * + * @see DoubleMetaphone + * @see Metaphone + * @author Claus Engel + */ +public final class Soundex extends PhoneticEncoder +{ + private final boolean full; + private final int length; + + /** + * Constructs a Soundex encoder which generates keys of given length. + * @param full a flag which specifies, whether the first character has to + * to be encoded, or not. If false, then this encoder works + * like the original soundex described by Knuth, i.e. 'Knuth' + * will become 'K530'. If true, then this encoder will encode + * the first character too, i.e. 'Knuth' will become '2530'. + * @param length the length of the keys to generate. + */ + public Soundex (boolean full, int length) + { + this.full = full; + this.length = length; + } + + /** + * Constructs a Soundex encoder which generates keys of length 4. + * @param full a flag which specifies, whether the first character has to + * to be encoded, or not. If false, this encoder works + * like the original soundex described by Knuth, i.e. 'Knuth' + * will become 'K530'. If true, this encoder will encode + * the first character too, i.e. 'Knuth' will become '2530'. + */ + public Soundex (boolean full) + { + this (full, 4); + } + + /** + * Constructs an original Soundex encoder which generates keys of given length. + * @param length the length of the keys to generate. + */ + public Soundex (int length) + { + this (false, length); + } + + /** + * Constructs an original Soundex encoder which generates keys of length 4. + */ + public Soundex () + { + this(false, 4); + } + + /** + * Returns a String identifying the algorithm. + */ + public String toString() + { + return "Soundex_"+full+"_"+length; + } + + private static char getCode (final char c) + { + switch (Character.toLowerCase(c)) + { + case 'b': + case 'p': + case 'f': + case 'v': return '1'; + case 'c': + case 's': + case 'k': + case 'g': + case 'j': + case 'q': + case 'x': + case 'z': return '2'; + case 'd': + case 't': return '3'; + case 'l': return '4'; + case 'm': + case 'n': return '5'; + case 'r': return '6'; + } + return '*'; + } + + /** + * Returns the encoding of the given word. + * @param word the word to encode. + * @return an array with the encoding of the word. + * This is never null. + */ + public String[] generateKeys (final String word) + { + return (word!=null && word.length()>0) ? new String[] {generateKey(word)} : EMPTY_KEYS; + } + + /** + * Returns the encoding of the given word. + * @param word the word to encode. + * @return the encoding of the word. This is never null. + */ + public String generateKey (final String word) + { + if (length <= 0) + return ""; + + if (word == null || word.length() == 0) + return ""; + + final char[] chars = word.toCharArray(); + final StringBuffer result = new StringBuffer(length); + + int inIdx, outIdx; + char prevDigit; + if (full) + { + inIdx = outIdx = 0; + prevDigit = '*'; + } + else + { + inIdx = outIdx = 1; + result.append(Character.toUpperCase(chars[0])); + prevDigit = getCode(chars[0]); + } + + while (inIdx < chars.length && outIdx < length) + { + char c = getCode(chars[inIdx]); + + if (c!='*' && c!=prevDigit) + { + result.append(c); + outIdx++; + } + + prevDigit = c; + inIdx++; + } + + for (; outIdx < length; outIdx++) + result.append('0'); + + return result.toString(); + } + + /** + * Test of algorithm with default constructed encoder. + * The encoded arguments are printed to System.out. + */ + public static void main (String[] argv) + { + final Soundex generator = new Soundex(); + + for (int n = 0; n < argv.length; n++) + System.out.println(generator.generateKey(argv[n])); + } +} +