Index: contrib/didyoumean/pom.xml.template =================================================================== --- contrib/didyoumean/pom.xml.template (revision 0) +++ contrib/didyoumean/pom.xml.template (revision 0) @@ -0,0 +1,61 @@ + + + + + 4.0.0 + + org.apache.lucene + lucene-contrib + @version@ + + org.apache.lucene + lucene-didyoumean + Lucene query suggester + @version@ + + Extended spell checker with phrase support and adaptive user session analysis. + + jar + + + + berkeleydb + je + 3.2.44 + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + + 1.5 + 1.5 + + + + + + + Index: contrib/didyoumean/lib/je-3.2.44.jar =================================================================== Cannot display: file marked as a binary type. svn:mime-type = application/octet-stream Property changes on: contrib/didyoumean/lib/je-3.2.44.jar ___________________________________________________________________ Name: svn:mime-type + application/octet-stream Index: contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/impl/TestDefaultGoalTreeExtractor.java =================================================================== --- contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/impl/TestDefaultGoalTreeExtractor.java (revision 0) +++ contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/impl/TestDefaultGoalTreeExtractor.java (revision 0) @@ -0,0 +1,33 @@ +package org.apache.lucene.search.didyoumean.impl; + +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + + +import org.apache.lucene.search.didyoumean.impl.DefaultQueryGoalTreeExtractor; + +/** + * @author Karl Wettin + * Date: 2007-feb-02 + * Time: 01:03:25 + */ +public class TestDefaultGoalTreeExtractor extends TestQueryGoalExtractor { + + + protected org.apache.lucene.search.didyoumean.QueryGoalTreeExtractor goalTreeExtractorFactory() { + return new DefaultQueryGoalTreeExtractor(); + } +} Index: contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/impl/TestGoalJuror.java =================================================================== --- contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/impl/TestGoalJuror.java (revision 0) +++ contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/impl/TestGoalJuror.java (revision 0) @@ -0,0 +1,272 @@ +package org.apache.lucene.search.didyoumean.impl; + +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +import junit.framework.TestCase; +import org.apache.lucene.search.didyoumean.QuerySession; +import org.apache.lucene.search.didyoumean.SecondLevelSuggester; +import org.apache.lucene.search.didyoumean.Suggestion; +import org.apache.lucene.search.didyoumean.SuggestionFacade; + +import java.io.*; +import java.net.URL; +import java.util.Map; +import java.util.zip.GZIPInputStream; + +/** + * @author Karl Wettin + * Date: 2007-feb-24 + * Time: 00:13:05 + */ +public class TestGoalJuror extends TestCase { + + // private static Log log = LogFactory.getLog(TestGoalJuror.class); + // private static long serialVersionUID = 1l; + + private File dataRootPath = new File("data"); + + @Override + protected void setUp() throws Exception { + if (!dataRootPath.exists()) { + dataRootPath.mkdirs(); + } + } + + /** left out here for closure reasons */ + private SuggestionFacade facade; + + + public void testConcept() throws Exception { + + facade = new SuggestionFacade(new File(dataRootPath, "TestGoalJuror.testConcept/" + String.valueOf(System.currentTimeMillis()))); + + QuerySession session = facade.getQuerySessionManager().querySessionFactory(); + session.query("heroes of night and magic", 0, null, 1l); + session.query("heroes of knight and magic", 0, null, 2l); + session.query("heroes of might and magic", 10, null, 3l); + + assertTrue(session.isExpired()); + facade.getQuerySessionManager().getSessionsByID().put(session); + + facade.trainExpiredQuerySessions(); + + assertEquals("heroes of might and magic", facade.didYouMean("heroes of night and magic")); + assertEquals("heroes of might and magic", facade.didYouMean("heroes of knight and magic")); + assertEquals("heroes of might and magic", facade.didYouMean("heroes of might and magic")); + + facade.close(); + + } + + + public void testImportData() throws Exception { + + // load 3,500,000 user queries with session id, time stamp and number of hits. + // no goals specified. let the goal juror figure that out. + // all queries are lower cased. + + File facadePath = new File(dataRootPath, "backup"); + // File facadePath = new File(dataRootPath, "TestGoalJuror.testImportData/" + String.valueOf(System.currentTimeMillis()); + + facade = new SuggestionFacade(facadePath); + + + if (facade.getDictionary().getSuggestionsByQuery().map().size() == 0) { + + File testDataFile = new File(dataRootPath, "queries_grouped.txt"); + if (!testDataFile.exists()) { + System.out.println("Downloading test data..."); + InputStream input = new GZIPInputStream(new URL("http://ginandtonique.org/~kalle/LUCENE-626/queries_grouped.txt.gz").openStream()); + OutputStream output = new FileOutputStream(testDataFile); + byte[] buf = new byte[102400]; + int len; + while ((len = input.read(buf)) > -1) { + output.write(buf, 0, len); + } + input.close(); + output.close(); + } + + System.out.println("Importing test data..."); + + + int cnt = 0; + int total = 0; + + BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(testDataFile), "UTF8")); + String line; + + assertEquals(5, br.readLine().split("\t").length); + + String sessionID = ""; + QuerySession querySession = null; + + while ((line = br.readLine()) != null) { + String[] values = line.split("\t"); + if (!values[0].equals(sessionID)) { + if (querySession != null) { + facade.getQuerySessionManager().getSessionsByID().put(querySession); + if (cnt == 10000) { + + // throws a nasty exception! + // http://forums.oracle.com/forums/thread.jspa?threadID=577588&tstart=0 + +// // train while still creating... +// if (total == 200000) { +// new Thread(new Runnable() { +// public void run() { +// try { +// facade.trainExpiredQuerySessions(2); +// } catch (Exception e) { +// e.printStackTrace(); +// // todo +// } +// } +// }).start(); +// } + + System.out.println(total + " sessions loaded."); + cnt = 0; + } + } + querySession = new QuerySession(); + querySession.setId(values[0]); + sessionID = values[0]; + cnt++; + total++; + } + String query = values[4].trim().toLowerCase(); + int hits = Integer.valueOf(values[3]); + long timestamp = Long.valueOf(values[1]) * 1000; + querySession.query(query, hits, null, timestamp); + } + facade.getQuerySessionManager().getSessionsByID().put(querySession); + br.close(); + + facade.trainExpiredQuerySessions(4); + + System.out.println("Reopening persistent dictionary."); + facade.close(); + facade = new SuggestionFacade(facadePath); + + } + + + System.out.println("Running assertation tests..."); + // run some tests without the second level suggestions, + // i.e. user behavioral data only. no ngrams or so. + + assertEquals("pirates of the caribbean", facade.didYouMean("pirates ofthe caribbean")); + + assertEquals("pirates of the caribbean", facade.didYouMean("pirates of the carribbean")); + + assertEquals("pirates caribbean", facade.didYouMean("pirates carricean")); + assertEquals("pirates of the caribbean", facade.didYouMean("pirates of the carriben")); + assertEquals("pirates of the caribbean", facade.didYouMean("pirates of the carabien")); + assertEquals("pirates of the caribbean", facade.didYouMean("pirates of the carabbean")); + + assertEquals("pirates of the caribbean", facade.didYouMean("pirates og carribean")); + + assertEquals("pirates of the caribbean soundtrack", facade.didYouMean("pirates of the caribbean music")); + assertEquals("pirates of the caribbean score", facade.didYouMean("pirates of the caribbean soundtrack")); + + assertEquals("pirate of caribbean", facade.didYouMean("pirate of carabian")); + assertEquals("pirates of caribbean", facade.didYouMean("pirate of caribbean")); + assertEquals("pirates of caribbean", facade.didYouMean("pirates of caribbean")); + + // depening on how many hits and goals are noted with these two queries + // perhaps the delta should be added to a synonym dictionary? + assertEquals("homm iv", facade.didYouMean("homm 4")); + + + + + + // add token phrase suggester: requires a lot of memory!! + + // not yet known.. and we have no second level yet. + assertNull(facade.didYouMean("the pilates")); + + System.out.println("Creating second level suggesters from comon typos..."); + // use the dictionary built from user queries to build the token phrase and ngram suggester. + + facade.getDictionary().getPrioritesBySecondLevelSuggester().putAll(facade.secondLevelSuggestionFactory()); + System.out.println("Done creating second level suggesters from comon typos..."); + + assertEquals("the pirates", facade.didYouMean("the pilates")); + // now it's learned, should take no time. + assertEquals("the pirates", facade.didYouMean("the pilates")); + + + + + // typos + assertEquals("heroes of might and magic", facade.didYouMean("heroes of fight and magic")); + assertEquals("heroes of might and magic", facade.didYouMean("heroes of right and magic")); + assertEquals("heroes of might and magic", facade.didYouMean("heroes of magic and light")); + + // composite dictionary key not learned yet.. + assertEquals(null, facade.didYouMean("heroesof lightand magik")); + // learn + assertEquals("heroes of might and magic", facade.didYouMean("heroes of light and magik")); + // test + assertEquals("heroes of might and magic", facade.didYouMean("heroesof lightand magik")); + + // wrong term order + assertEquals("heroes of might and magic", facade.didYouMean("heroes of magic and might")); + + + System.out.println("Reopening persistent dictionary."); + facade.close(); + facade = new SuggestionFacade(facadePath); + System.out.println("Persistent dictionary reopened."); + + // same tests again, to make sure persistency works. + + // should be known by now + assertEquals("the pirates", facade.didYouMean("the pilates")); + + // run some tests without the second level suggestions, + // i.e. user behavioral data only. no ngrams or so. + + assertEquals("pirates of the caribbean", facade.didYouMean("pirates ofthe caribbean")); + + assertEquals("pirates of the caribbean", facade.didYouMean("pirates of the carribbean")); + + assertEquals("pirates caribbean", facade.didYouMean("pirates carricean")); + assertEquals("pirates of the caribbean", facade.didYouMean("pirates of the carriben")); + assertEquals("pirates of the caribbean", facade.didYouMean("pirates of the carabien")); + assertEquals("pirates of the caribbean", facade.didYouMean("pirates of the carabbean")); + + assertEquals("pirates of the caribbean", facade.didYouMean("pirates og carribean")); + + assertEquals("pirates of the caribbean soundtrack", facade.didYouMean("pirates of the caribbean music")); + assertEquals("pirates of the caribbean score", facade.didYouMean("pirates of the caribbean soundtrack")); + + assertEquals("pirate of caribbean", facade.didYouMean("pirate of carabian")); + assertEquals("pirates of caribbean", facade.didYouMean("pirate of caribbean")); + assertEquals("pirates of caribbean", facade.didYouMean("pirates of caribbean")); + + // depening on how many hits and goals are noted with these two queries + // perhaps the delta should be added to a synonym dictionary? + assertEquals("homm iv", facade.didYouMean("homm 4")); + + + } + + +} Index: contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/impl/TestQueryGoalExtractor.java =================================================================== --- contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/impl/TestQueryGoalExtractor.java (revision 0) +++ contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/impl/TestQueryGoalExtractor.java (revision 0) @@ -0,0 +1,103 @@ +package org.apache.lucene.search.didyoumean.impl; + +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + + +import junit.framework.TestCase; + +import java.util.List; + +import org.apache.lucene.search.didyoumean.QueryGoalNode; + +/** + * @author Karl Wettin + * Date: Aug 4, 2006 + * Time: 6:41:38 AM + */ +public abstract class TestQueryGoalExtractor extends TestCase { + + private org.apache.lucene.search.didyoumean.QueryGoalTreeExtractor goalTreeExtractor; + + @Override + protected void setUp() throws Exception { + goalTreeExtractor = goalTreeExtractorFactory(); + } + + protected abstract org.apache.lucene.search.didyoumean.QueryGoalTreeExtractor goalTreeExtractorFactory(); + + public void testSingleNode() throws Exception { + assertEquals(1, goalTreeExtractor.extractGoalRoots(new QueryGoalNode(null, "heroes of knight and magic", 10, null, 1l)).size()); + + } + + public void testTime() throws Exception { + + QueryGoalNode node; + + QueryGoalNode session = new QueryGoalNode(null, "heroes of knight and magic", 10, null, 1l); + node = new QueryGoalNode(session, "heroes of knight and magic", 10, 0l); + node = new QueryGoalNode(node, "heroes of night and magic", 13, 10000l); + node = new QueryGoalNode(node, "heroes of might and magic", 132, 15000l); + node = new QueryGoalNode(node, "heroes of might and magic 3", 12, 25000l); + node.new Inspection(null, QueryGoalNode.MOO, 25000l); + node = new QueryGoalNode(node, "heroes of might and magic iv", 12, 100000l); + node = new QueryGoalNode(node, "heroes of might and magic 4", 6, 105000l); + node.new Inspection(null, QueryGoalNode.GOAL, 105000l); + + assertEquals(1, goalTreeExtractor.extractGoalRoots(session).size()); + + } + + public void testSimilarity() throws Exception { + + QueryGoalNode node; + QueryGoalNode session = new QueryGoalNode(null, "heroes of knight and magic", 10, null, 0l); + node = new QueryGoalNode(session, "heroes of night and magic", 13, 10000l); + node = new QueryGoalNode(node, "heroes of might and magic", 132, 15000l); + node = new QueryGoalNode(node, "heroes of might and magic 3", 12, 25000l); + + node = new QueryGoalNode(node, "the davinci code", 12, 27000l); + new QueryGoalNode(node, "the da vinci code", 6, 29000l); + + List> goalTreeRoots = goalTreeExtractor.extractGoalRoots(session); + + assertEquals(2, goalTreeRoots.size()); + assertEquals(1, goalTreeRoots.get(0).numChildrenRecursive()); + assertEquals(3, goalTreeRoots.get(1).numChildrenRecursive()); + + } + + public void testHits() throws Exception { + QueryGoalNode node; + QueryGoalNode session = new QueryGoalNode(null, "heroes of knight and magic", 10, null, 0l); + node = new QueryGoalNode(session, "heroes of night and magic", 13, 10000l); + node = new QueryGoalNode(node, "heroes of might and magic", 132, 15000l); + node = new QueryGoalNode(node, "heroes of might and magic 3", 0, 25000l); + + node = new QueryGoalNode(node, "the davinci code", 12, 27000l); + new QueryGoalNode(node, "the da vinci code", 6, 29000l); + + List> goalTreeRoots = goalTreeExtractor.extractGoalRoots(session); + + assertEquals(2, goalTreeRoots.size()); + assertEquals(1, goalTreeRoots.get(0).numChildrenRecursive()); + assertEquals(3, goalTreeRoots.get(1).numChildrenRecursive()); + + + } + +} Index: contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/impl/TestDefaultImplementation.java =================================================================== --- contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/impl/TestDefaultImplementation.java (revision 0) +++ contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/impl/TestDefaultImplementation.java (revision 0) @@ -0,0 +1,235 @@ +package org.apache.lucene.search.didyoumean.impl; + +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + + +import junit.framework.TestCase; +import org.apache.lucene.search.didyoumean.QueryGoalNode; +import org.apache.lucene.search.didyoumean.Suggestion; +import org.apache.lucene.search.didyoumean.SuggestionFacade; +import org.apache.lucene.search.didyoumean.secondlevel.token.SecondLevelTokenPhraseSuggester; +import org.apache.lucene.search.didyoumean.secondlevel.token.ngram.NgramTokenSuggester; +import org.apache.lucene.search.didyoumean.secondlevel.token.ngram.TermEnumIterator; +import org.apache.lucene.index.facade.IndexFacade; +import org.apache.lucene.index.facade.DirectoryIndexFacade; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.analysis.WhitespaceAnalyzer; +import org.apache.lucene.store.RAMDirectory; + +import java.io.File; + +/** + * @author Karl Wettin + * Date: Jul 31, 2006 + * Time: 12:26:07 PM + */ +public class TestDefaultImplementation extends TestCase { + + private SuggestionFacade suggestionFacade; + + @Override + protected void setUp() throws Exception { + + suggestionFacade = new SuggestionFacade(new File("data/TestDefaultImplementation/" + String.valueOf(System.currentTimeMillis()))); + + // your primary index that suggestions must match. + IndexFacade aprioriIndex = new DirectoryIndexFacade(new RAMDirectory()); + aprioriIndex.indexWriterFactory(null, true).close(); + String aprioriField = "title"; + + // build the ngram suggester + IndexFacade ngramIndex = new DirectoryIndexFacade(new RAMDirectory()); + ngramIndex.indexWriterFactory(null, true).close(); + NgramTokenSuggester ngramSuggester = new NgramTokenSuggester(ngramIndex); + + IndexReader aprioriReader = aprioriIndex.indexReaderFactory(); + ngramSuggester.indexDictionary(new TermEnumIterator(aprioriReader, aprioriField)); + + // the greater the better results but with a longer response time. + int maxSuggestionsPerToken = 3; + + // add ngram suggester wrapped in a single token phrase suggester as second level suggester. + suggestionFacade.getDictionary().getPrioritesBySecondLevelSuggester().put(new SecondLevelTokenPhraseSuggester(ngramSuggester, aprioriField, false, maxSuggestionsPerToken, new WhitespaceAnalyzer(), aprioriIndex), 1d); + } + + public void testBasicTraining() throws Exception { + QueryGoalNode node; + + node = new QueryGoalNode(null, "heroes of nmight and magic", 3); + node = new QueryGoalNode(node, "heroes of night and magic", 3); + node = new QueryGoalNode(node, "heroes of might and magic", 10); + node.new Inspection(23, QueryGoalNode.GOAL); + suggestionFacade.trainSessionQueryTree(node); + + node = new QueryGoalNode(null, "heroes of night and magic", 3); + node = new QueryGoalNode(node, "heroes of knight and magic", 7); + node = new QueryGoalNode(node, "heroes of might and magic", 20); + node.new Inspection(23, QueryGoalNode.GOAL); + suggestionFacade.trainSessionQueryTree(node); + + node = new QueryGoalNode(null, "heroes of might and magic", 20, 1l); + suggestionFacade.trainSessionQueryTree(node); + + node = new QueryGoalNode(null, "heroes of night and magic", 7, 0l); + node = new QueryGoalNode(node, "heroes of light and magic", 14, 1l); + node = new QueryGoalNode(node, "heroes of might and magic", 2, 6l); + node.new Inspection(23, QueryGoalNode.GOAL); + node.new Inspection(23, QueryGoalNode.GOAL); + suggestionFacade.trainSessionQueryTree(node); + + node = new QueryGoalNode(null, "heroes of night and magic", 4, 0l); + node = new QueryGoalNode(node, "heroes of knight and magic", 17, 1l); + node = new QueryGoalNode(node, "heroes of might and magic", 2, 2l); + node.new Inspection(23, QueryGoalNode.GOAL); + suggestionFacade.trainSessionQueryTree(node); + + assertEquals("heroes of might and magic", suggestionFacade.didYouMean("heroes of light and magic")); + assertEquals("heroes of might and magic", suggestionFacade.didYouMean("heroes of night and magic")); + + assertEquals("heroes of might and magic", suggestionFacade.didYouMean("heroes ofnight andmagic")); + + // todo fight has not been trained. use trie . + //assertEquals("heroes of might and magic", spellChecker.didYouMean(didYouMean.getDictionary(), "heroes of fight and magic")); + + + } + + + public void testSynonymTraining() throws Exception { + QueryGoalNode node; + + + node = new QueryGoalNode(null, "heroes of night and magic", 12, 0l); + node = new QueryGoalNode(node, "heroes of might and magic", 30, 1l); + node.new Inspection(23, QueryGoalNode.GOAL); + suggestionFacade.trainSessionQueryTree(node); + + node = new QueryGoalNode(null, "heroes of night and magic", 13, 0l); + node = new QueryGoalNode(node, "heroes of knight and magic", 14, 1l); + node = new QueryGoalNode(node, "heroes of might and magic", 22, 2l); + node.new Inspection(23, QueryGoalNode.GOAL); + node = new QueryGoalNode(node, "homm", 17, 4l); + node.new Inspection(23, QueryGoalNode.GOAL); + suggestionFacade.trainSessionQueryTree(node); + + + assertEquals("heroes of might and magic", suggestionFacade.didYouMean("homm")); + + + node = new QueryGoalNode(null, "homm", 17, 0l); + node.new Inspection(23, QueryGoalNode.GOAL); + suggestionFacade.trainSessionQueryTree(node); + + + node = new QueryGoalNode(null, "heroes of night and magic", 12, 0l); + node = new QueryGoalNode(node, "heroes of light and magic", 4, 1l); + node = new QueryGoalNode(node, "heroes of might and magic", 17, 2l); + node.new Inspection(23, QueryGoalNode.GOAL); + node = new QueryGoalNode(node, "homm", 34, 4l); + node.new Inspection(23, QueryGoalNode.GOAL); + node.new Inspection(24, QueryGoalNode.GOAL); + suggestionFacade.trainSessionQueryTree(node); + + + node = new QueryGoalNode(null, "heroes of night and magic", 3, 0l); + node = new QueryGoalNode(node, "heroes of knight and magic", 3, 1l); + node = new QueryGoalNode(node, "heroes of might and magic", 22, 2l); + node.new Inspection(23, QueryGoalNode.GOAL); + suggestionFacade.trainSessionQueryTree(node); + + assertEquals("homm", suggestionFacade.didYouMean("heroes of might and magic")); + assertEquals("heroes of might and magic", suggestionFacade.didYouMean("heroes of night and magic")); + + // todo fight has not been trained. navigate the shortest way down the trie to find suggestion? + //assertEquals("heroes of might and magic", spellChecker.didYouMean(didYouMean.getDictionary(), "heroes of fight and magic")); + + assertEquals("homm", suggestionFacade.didYouMean("heroes of might and magic")); + assertEquals("heroes of might and magic", suggestionFacade.didYouMean("homm")); + + } + + public void testPrefixedSynonymTraining() throws Exception { + QueryGoalNode node; + + + node = new QueryGoalNode(null, "heroes of might and magic 2", 22, 0l); + node = new QueryGoalNode(node, "heroes of might and magic ii", 13, 1l); + node.new Inspection(23, QueryGoalNode.GOAL); + suggestionFacade.trainSessionQueryTree(node); + + + + node = new QueryGoalNode(null, "heroes of might and magic 2", 22, 0l); + node = new QueryGoalNode(node, "heroes of might and magic ii", 13, suggestionFacade.didYouMean("heroes of might and magic 2", 1)[0].getSuggested(), 1l); + node.new Inspection(23, QueryGoalNode.GOAL); + suggestionFacade.trainSessionQueryTree(node); + + + + node = new QueryGoalNode(null, "heroes of might and magic ii", 22, 0l); + node = new QueryGoalNode(node, "heroes of might and magic 2", 13, suggestionFacade.didYouMean("heroes of might and magic ii", 1)[0].getSuggested(), 1l); + node.new Inspection(23, QueryGoalNode.GOAL); + suggestionFacade.trainSessionQueryTree(node); + + node = new QueryGoalNode(null, "heroes of might and magic ii", 22, 0l); + node.new Inspection(23, QueryGoalNode.GOAL); + node = new QueryGoalNode(node, "heroes of might and magic 2", 13, suggestionFacade.didYouMean("heroes of might and magic ii", 1)[0].getSuggested(), 2l); + node.new Inspection(23, QueryGoalNode.GOAL); + suggestionFacade.trainSessionQueryTree(node); + + assertEquals("heroes of might and magic 2", suggestionFacade.didYouMean("heroes of might and magic ii")); + assertEquals("heroes of might and magic ii", suggestionFacade.didYouMean("heroes of might and magic 2")); + + } + + public void testNegativeAdaptation() throws Exception { + + testSynonymTraining(); + + + Suggestion aprioriSuggestion = suggestionFacade.getDictionary().getSuggestions("heroesofmightandmagic").get(0); + assertEquals("homm", aprioriSuggestion.getSuggested()); + + for (int i = 0; i < 10; i++) { + QueryGoalNode node; + node = new QueryGoalNode(null, "heroes of might and magic", 22, 0l); + node.new Inspection(23, QueryGoalNode.GOAL); + suggestionFacade.trainSessionQueryTree(node); + } + + Suggestion resultSuggestion = suggestionFacade.getDictionary().getSuggestions("heroesofmightandmagic").get(0); + + assertFalse("homm".equals(resultSuggestion.getSuggested())); + + } + + + public void testWhiteSpace() throws Exception { + + QueryGoalNode node; + + node = new QueryGoalNode(null, "the davinci code", 22, 0l); + node = new QueryGoalNode(node, "the da vinci code", 13, 1l); + node.new Inspection(23, QueryGoalNode.GOAL); + suggestionFacade.trainSessionQueryTree(node); + + assertEquals("the da vinci code", suggestionFacade.didYouMean("thedavincicode")); + assertEquals("the da vinci code", suggestionFacade.didYouMean("the dav-inci code")); + + } + +} Index: contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/secondlevel/token/ngram/TestNgramTokenSuggester.java =================================================================== --- contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/secondlevel/token/ngram/TestNgramTokenSuggester.java (revision 0) +++ contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/secondlevel/token/ngram/TestNgramTokenSuggester.java (revision 0) @@ -0,0 +1,190 @@ +package org.apache.lucene.search.didyoumean.secondlevel.token.ngram; + +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +import junit.framework.TestCase; +import org.apache.lucene.analysis.SimpleAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.*; +import org.apache.lucene.index.facade.IndexFacade; +import org.apache.lucene.index.facade.IndexWriterFacade; +import org.apache.lucene.index.facade.DirectoryIndexFacade; +import org.apache.lucene.store.RAMDirectory; +import org.apache.lucene.search.didyoumean.Suggestion; + +import java.io.IOException; + +/** + * @author Karl Wettin + * Date: 2007-feb-03 + * Time: 06:27:55 + */ +public class TestNgramTokenSuggester extends TestCase { + + // private static Log log = LogFactory.getLog(TestNgramTokenSuggester.class); + // private static long serialVersionUID = 1l; + + private NgramTokenSuggester ngramTokenSuggester; + private IndexFacade aprioriIndex; + + protected void setUp() throws Exception { + super.setUp(); + + //create a user index + aprioriIndex = new DirectoryIndexFacade(new RAMDirectory()); + IndexWriterFacade writer = aprioriIndex.indexWriterFactory(new SimpleAnalyzer(), true); + + for (int i = 0; i < 1000; i++) { + Document doc = new Document(); + doc.add(new Field("field1", intToEnglish(i), Field.Store.YES, Field.Index.TOKENIZED)); + doc.add(new Field("field2", intToEnglish(i + 1), Field.Store.YES, Field.Index.TOKENIZED)); + writer.addDocument(doc); + } + + + writer.close(); + + // create the spellChecker + IndexFacade ngramIndex = new DirectoryIndexFacade(new RAMDirectory()); + ngramIndex.indexWriterFactory(null, true).close(); + ngramTokenSuggester = new NgramTokenSuggester(ngramIndex); + } + + + public void testBuild() { + try { + + IndexReader reader = aprioriIndex.indexReaderFactory(); + + addwords(reader, "field1"); + int num_field1 = ngramTokenSuggester.getNgramReader().numDocs(); + + addwords(reader, "field2"); + int num_field2 = ngramTokenSuggester.getNgramReader().numDocs(); + + + assertEquals(num_field2, num_field1 + 1); + + // test small word + + assertEquals("five", ((Suggestion)ngramTokenSuggester.suggest("fvie", 2).top()).getSuggested()); + assertEquals("nine", ((Suggestion)ngramTokenSuggester.suggest("five", 2).top()).getSuggested()); + assertEquals("five", ((Suggestion)ngramTokenSuggester.suggest("fiv", 2).top()).getSuggested()); + assertEquals("five", ((Suggestion)ngramTokenSuggester.suggest("ive", 20).top()).getSuggested()); + assertEquals("five", ((Suggestion)ngramTokenSuggester.suggest("fives", 20).top()).getSuggested()); + assertEquals("five", ((Suggestion)ngramTokenSuggester.suggest("fie", 20).top()).getSuggested()); + assertEquals(0, ngramTokenSuggester.suggest("fi", 20).size()); + + // test restraint to a field + assertEquals(0, ngramTokenSuggester.suggest("tousand", 100, false, reader, "field1", false).size()); + assertEquals(1, ngramTokenSuggester.suggest("tousand", 100, false, reader, "field2", false).size()); + + } catch (IOException e) { + e.printStackTrace(); + fail(); + } + } + + + private void addwords(IndexReader r, String field) throws IOException { + long time = System.currentTimeMillis(); + ngramTokenSuggester.indexDictionary(new TermEnumIterator(r, field)); + time = System.currentTimeMillis() - time; + //System.out.println("time to build " + field + ": " + time); + } + + + public static String intToEnglish(int i) { + StringBuffer result = new StringBuffer(); + intToEnglish(i, result); + return result.toString(); + } + + public static void intToEnglish(int i, StringBuffer result) { + if (i == 0) { + result.append("zero"); + return; + } + if (i < 0) { + result.append("minus "); + i = -i; + } + if (i >= 1000000000) { // billions + intToEnglish(i/1000000000, result); + result.append("billion, "); + i = i%1000000000; + } + if (i >= 1000000) { // millions + intToEnglish(i/1000000, result); + result.append("million, "); + i = i%1000000; + } + if (i >= 1000) { // thousands + intToEnglish(i/1000, result); + result.append("thousand, "); + i = i%1000; + } + if (i >= 100) { // hundreds + intToEnglish(i/100, result); + result.append("hundred "); + i = i%100; + } + if (i >= 20) { + switch (i/10) { + case 9 : result.append("ninety"); break; + case 8 : result.append("eighty"); break; + case 7 : result.append("seventy"); break; + case 6 : result.append("sixty"); break; + case 5 : result.append("fifty"); break; + case 4 : result.append("forty"); break; + case 3 : result.append("thirty"); break; + case 2 : result.append("twenty"); break; + } + i = i%10; + if (i == 0) + result.append(" "); + else + result.append("-"); + } + switch (i) { + case 19 : result.append("nineteen "); break; + case 18 : result.append("eighteen "); break; + case 17 : result.append("seventeen "); break; + case 16 : result.append("sixteen "); break; + case 15 : result.append("fifteen "); break; + case 14 : result.append("fourteen "); break; + case 13 : result.append("thirteen "); break; + case 12 : result.append("twelve "); break; + case 11 : result.append("eleven "); break; + case 10 : result.append("ten "); break; + case 9 : result.append("nine "); break; + case 8 : result.append("eight "); break; + case 7 : result.append("seven "); break; + case 6 : result.append("six "); break; + case 5 : result.append("five "); break; + case 4 : result.append("four "); break; + case 3 : result.append("three "); break; + case 2 : result.append("two "); break; + case 1 : result.append("one "); break; + case 0 : result.append(""); break; + } + } + + + +} Index: contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/secondlevel/TestTokenPhraseSuggester.java =================================================================== --- contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/secondlevel/TestTokenPhraseSuggester.java (revision 0) +++ contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/secondlevel/TestTokenPhraseSuggester.java (revision 0) @@ -0,0 +1,104 @@ +package org.apache.lucene.search.didyoumean.secondlevel; + +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +import junit.framework.TestCase; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.*; +import org.apache.lucene.index.facade.IndexWriterFacade; +import org.apache.lucene.index.facade.DirectoryIndexFacade; +import org.apache.lucene.index.facade.IndexFacade; +import org.apache.lucene.search.didyoumean.secondlevel.token.TokenPhraseSuggester; +import org.apache.lucene.search.didyoumean.secondlevel.token.TokenPhraseSuggesterImpl; +import org.apache.lucene.search.didyoumean.secondlevel.token.ngram.NgramTokenSuggester; +import org.apache.lucene.search.didyoumean.secondlevel.token.ngram.TermEnumIterator; +import org.apache.lucene.store.RAMDirectory; + +import java.util.Collections; + +/** + * @author Karl Wettin + * Date: 2007-feb-02 + * Time: 04:36:49 + */ +public class TestTokenPhraseSuggester extends TestCase { + + // private static Log log = LogFactory.getLog(TestNgramPhraseSuggester.class); + // private static long serialVersionUID = 1l; + + + public void testPhraseSuggester() throws Exception { + + IndexFacade aprioriIndex = new DirectoryIndexFacade(new RAMDirectory()); + aprioriIndex.indexWriterFactory(null, true).close(); + + Analyzer analyzer = new StandardAnalyzer(Collections.EMPTY_SET); + final String field = "field"; + + // the apriori index - used to build ngrams and to check if suggestions are any good. + IndexWriterFacade indexWriter = aprioriIndex.indexWriterFactory(analyzer, true); + addDocument(indexWriter, field, "heroes of might and magic III complete", Field.TermVector.WITH_POSITIONS_OFFSETS); + addDocument(indexWriter, field, "it might be the best game ever made", Field.TermVector.WITH_POSITIONS_OFFSETS); + addDocument(indexWriter, field, "forget about the rest", Field.TermVector.WITH_POSITIONS_OFFSETS); + addDocument(indexWriter, field, "it's all in the fame", Field.TermVector.WITH_POSITIONS_OFFSETS); + addDocument(indexWriter, field, "lost in translation", Field.TermVector.WITH_POSITIONS_OFFSETS); + addDocument(indexWriter, field, "on the little hill there is a little tree, never have I seen such a little tree", Field.TermVector.NO); + indexWriter.close(); + + IndexReader reader = aprioriIndex.indexReaderFactory(); + + // the single token suggester + IndexFacade ngramIndex = new DirectoryIndexFacade(new RAMDirectory()); + ngramIndex.indexWriterFactory(null, true).close(); + + NgramTokenSuggester tokenSuggester = new NgramTokenSuggester(ngramIndex); + tokenSuggester.indexDictionary(new TermEnumIterator(reader, field), 2); + + // the phrase suggester backed by single token suggester + TokenPhraseSuggester phraseSuggester = new TokenPhraseSuggesterImpl(tokenSuggester, field, false, 3, analyzer, aprioriIndex); + + assertEquals("lost in translation", phraseSuggester.didYouMean("lost on translation")); + assertEquals("heroes might magic", phraseSuggester.didYouMean("magic light heros")); + assertEquals("heroes of might and magic", phraseSuggester.didYouMean("heros on light and magik")); + assertEquals("best game made", phraseSuggester.didYouMean("game best made")); + assertEquals("game made", phraseSuggester.didYouMean("made game")); + assertEquals("game made", phraseSuggester.didYouMean("made lame")); + assertEquals("the game", phraseSuggester.didYouMean("the game")); + assertEquals("in the fame", phraseSuggester.didYouMean("in the game")); + assertEquals("made", phraseSuggester.didYouMean("mede")); + assertEquals(0, phraseSuggester.suggest("may game", 3).size()); + + // make sure that non term positions act as they should + + assertEquals("lost in translation", phraseSuggester.didYouMean("in lost translation")); + assertEquals("have never i seen", phraseSuggester.didYouMean("have nevrer i seen")); + + reader.close(); + + + } + + private void addDocument(IndexWriterFacade indexWriter, String field, String text, Field.TermVector termVector) throws Exception { + Document document = new Document(); + document.add(new Field(field, text, Field.Store.YES, Field.Index.TOKENIZED, termVector)); + indexWriter.addDocument(document); + } + +} Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/EditDistance.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/EditDistance.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/EditDistance.java (revision 0) @@ -0,0 +1,52 @@ +package org.apache.lucene.search.didyoumean; + +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +/** + * @author Karl Wettin + * Date: 2007-feb-17 + * Time: 03:08:24 + */ +public abstract class EditDistance { + protected final char[] sa; + + public EditDistance(String sa) { + this.sa = sa.toCharArray(); + } + + // private static Log log = LogFactory.getLog(EditDistance.class); + // private static long serialVersionUID = 1l; + + /** + * @param query string to measure distance to. + * @return distance to query + */ + public abstract int getDistance(String query); + + public double getNormalizedDistance(String query) { + return + 1.0d - ((double) getDistance(query) / Math + .min(query.length(), sa.length)); + + } + + public char[] getSa() { + return sa; + } + + +} Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/Trainer.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/Trainer.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/Trainer.java (revision 0) @@ -0,0 +1,54 @@ +package org.apache.lucene.search.didyoumean; + +import com.sleepycat.je.DatabaseException; +import org.apache.lucene.search.didyoumean.dictionary.Dictionary; + +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +/** + * The analyzing part of a session analyzed dictionary. + * It build and updates the dictionary based on what users did during their visit. + * + * @author Karl Wettin + * Date: Jul 31, 2006 + * Time: 4:46:13 PM + */ +public interface Trainer { + + /** + * @param dictionary the dictionary to update. + * @param goalTreeRoot the query goal tree to be used for training. + */ + public abstract void trainGoalTree(Dictionary dictionary, QueryGoalNode goalTreeRoot) throws DatabaseException; + + +// /** +// * Places a goal tree in the queue. +// * +// * @param goalTreeRoot goal tree to be queued. +// * @see org.apache.lucene.search.didyoumean.Trainer#flush(org.apache.lucene.search.didyoumean.dictionary.Dictionary) +// */ +// public abstract void queueGoalTree(QueryGoalNode goalTreeRoot); +// +// /** +// * Process all available goal trees in the queue. +// * +// * @param dictionary dictionary to work against +// */ +// public abstract void flush(Dictionary dictionary) throws DatabaseException; + +} Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/QuerySessionManager.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/QuerySessionManager.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/QuerySessionManager.java (revision 0) @@ -0,0 +1,87 @@ +package org.apache.lucene.search.didyoumean; + +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.Environment; +import com.sleepycat.je.EnvironmentConfig; +import com.sleepycat.persist.EntityStore; +import com.sleepycat.persist.PrimaryIndex; +import com.sleepycat.persist.StoreConfig; + +import java.io.File; +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +/** + * @author karl wettin + * Date: 2007-okt-17 + * Time: 18:42:42 + */ +public class QuerySessionManager { + + private EntityStore store; + private PrimaryIndex sessionsByID; + + + public QuerySessionManager(File bdbPath) throws DatabaseException { + if (!bdbPath.exists()) { + System.out.println("Creating path " + bdbPath); + bdbPath.mkdirs(); + } + + EnvironmentConfig envConfig = new EnvironmentConfig(); + envConfig.setAllowCreate(true); + envConfig.setTransactional(false); + envConfig.setLocking(false); + Environment env = new Environment(bdbPath, envConfig); + + StoreConfig storeConfig = new StoreConfig(); + storeConfig.setAllowCreate(true); + storeConfig.setTransactional(false); + store = new EntityStore(env, "sessionManager", storeConfig); + + sessionsByID = store.getPrimaryIndex(String.class, QuerySession.class); + } + + public void close() throws DatabaseException { + store.close(); + } + + public EntityStore getStore() { + return store; + } + + public PrimaryIndex getSessionsByID() { + return sessionsByID; + } + + public synchronized QuerySession querySessionFactory() throws DatabaseException { + // ensure unique session id, todo better solution + try { + Thread.sleep(1); + } catch (InterruptedException ie) { + // whatever + } + return querySessionFactory(String.valueOf(System.currentTimeMillis())); + } + + public QuerySession querySessionFactory(String id) throws DatabaseException { + QuerySession querySession = new QuerySession(); + querySession.setId(id); + getSessionsByID().put(querySession); + return querySession; + } + +} Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/Suggester.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/Suggester.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/Suggester.java (revision 0) @@ -0,0 +1,41 @@ +package org.apache.lucene.search.didyoumean; + +import com.sleepycat.je.DatabaseException; +import org.apache.lucene.search.didyoumean.dictionary.Dictionary; + +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +/** + * A suggester is the entity that navigates a dictionary in search for a suggestion to a query. + * + * @author Karl Wettin + * Date: Jul 31, 2006 + * Time: 4:46:57 PM + */ +public interface Suggester { + + /** + * @param dictionary + * @param query + * @param n max number of suggestions @return + */ + public abstract Suggestion[] didYouMean(Dictionary dictionary, String query, int n) throws DatabaseException; + + public abstract String didYouMean(Dictionary dictionary, String query) throws DatabaseException ; + + +} Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/QuerySession.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/QuerySession.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/QuerySession.java (revision 0) @@ -0,0 +1,160 @@ +package org.apache.lucene.search.didyoumean; + +import com.sleepycat.persist.model.Entity; +import com.sleepycat.persist.model.PrimaryKey; + +import java.util.ArrayList; +import java.util.List; +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +/** + * @author karl wettin + * Date: 2007-okt-17 + * Time: 17:24:47 + */ +@Entity +public class QuerySession { + + private static long defaultExpirationTimeMilliseconds = 1000 * 60 * 10; // 10 minutes + + @PrimaryKey + private String id; + + private long lastTouched = System.currentTimeMillis(); + private long expirationTimeMilliseconds = defaultExpirationTimeMilliseconds; + + private List> nodes = new ArrayList>(); + + /** + * Sets the last query node as parent, or null if no previous nodes. + * + * @param query user query + * @param corpusQueryResults number of hits + * @return query node index + */ + public Integer query(String query, Integer corpusQueryResults) { + return query(query, corpusQueryResults, null); + } + + + + /** + * Sets the last query node as parent, or null if no previous nodes. + * + * @param query user query + * @param corpusQueryResults number of hits + * @param suggestion suggestion to user query as given by suggester + * @return query node index + */ + public Integer query(String query, Integer corpusQueryResults, String suggestion) { + return query(query, corpusQueryResults, suggestion, System.currentTimeMillis()); + } + + /** + * @param parentNodeIndex parent query + * @param query user query + * @param corpusQueryResults number of hits + * @param suggestion suggestion to user query as given by suggester + * @return query node index + */ + public Integer query(Integer parentNodeIndex, String query, Integer corpusQueryResults, String suggestion) { + return query(parentNodeIndex, query, corpusQueryResults, suggestion, System.currentTimeMillis()); + } + + /** + * Sets the last query node as parent, or null if no previous nodes. + * + * @param query user query + * @param corpusQueryResults number of hits + * @param suggestion suggestion to user query as given by suggester + * @param timeStamp + * @return query node index + */ + public Integer query(String query, Integer corpusQueryResults, String suggestion, Long timeStamp) { + return query(nodes.size() == 0 ? null : nodes.size() - 1, query, corpusQueryResults, suggestion, timeStamp); + } + + /** + * + * @param parentNodeIndex parent query + * @param query user query + * @param corpusQueryResults number of hits + * @param suggestion suggestion to user query as given by suggester + * @param timeStamp + * @return query node index + */ + public synchronized Integer query(Integer parentNodeIndex, String query, Integer corpusQueryResults, String suggestion, Long timeStamp) { + nodes.add(new QueryGoalNode(parentNodeIndex == null ? null : nodes.get(parentNodeIndex), query, corpusQueryResults, suggestion, timeStamp)); + lastTouched = timeStamp; + return nodes.size() - 1; + } + + public void inspect(int nodeIndex, R reference, double goalClassification) { + inspect(nodeIndex, reference, goalClassification, System.currentTimeMillis()); + } + + public void inspect(int nodeIndex, R reference, double goalClassification, Long timeStamp) { + lastTouched = timeStamp; + nodes.get(nodeIndex).new Inspection(reference, goalClassification, timeStamp); + } + + public static long getDefaultExpirationTimeMilliseconds() { + return defaultExpirationTimeMilliseconds; + } + + public static void setDefaultExpirationTimeMilliseconds(long defaultExpirationTimeMilliseconds) { + QuerySession.defaultExpirationTimeMilliseconds = defaultExpirationTimeMilliseconds; + } + + public String getId() { + return id; + } + + public void setId(String id) { + this.id = id; + } + + public long getExpirationTimeMilliseconds() { + return expirationTimeMilliseconds; + } + + public void setExpirationTimeMilliseconds(long expirationTimeMilliseconds) { + this.expirationTimeMilliseconds = expirationTimeMilliseconds; + } + + + public List> getNodes() { + return nodes; + } + + public void setNodes(List> nodes) { + this.nodes = nodes; + } + + + public long getLastTouched() { + return lastTouched; + } + + public void setLastTouched(long lastTouched) { + this.lastTouched = lastTouched; + } + + public boolean isExpired() { + return getLastTouched() + getExpirationTimeMilliseconds() < System.currentTimeMillis(); + } +} Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/SuggestionFacade.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/SuggestionFacade.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/SuggestionFacade.java (revision 0) @@ -0,0 +1,375 @@ +package org.apache.lucene.search.didyoumean; + +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +import com.sleepycat.je.DatabaseException; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.facade.IndexFacade; +import org.apache.lucene.index.facade.IndexFacadeFactory; +import org.apache.lucene.index.facade.InstantiatedIndexFacade; +import org.apache.lucene.search.didyoumean.dictionary.Dictionary; +import org.apache.lucene.search.didyoumean.impl.DefaultAprioriCorpusFactory; +import org.apache.lucene.search.didyoumean.impl.DefaultQueryGoalTreeExtractor; +import org.apache.lucene.search.didyoumean.impl.DefaultSuggester; +import org.apache.lucene.search.didyoumean.impl.DefaultTrainer; +import org.apache.lucene.search.didyoumean.secondlevel.token.MultiTokenSuggester; +import org.apache.lucene.search.didyoumean.secondlevel.token.SecondLevelTokenPhraseSuggester; +import org.apache.lucene.search.didyoumean.secondlevel.token.ngram.NgramTokenSuggester; +import org.apache.lucene.search.didyoumean.secondlevel.token.ngram.TermEnumIterator; +import org.apache.lucene.store.instantiated.InstantiatedIndex; + +import java.io.File; +import java.io.IOException; +import java.util.Collections; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.ConcurrentLinkedQueue; + +/** + * Consumer interface for the adaptive user session analyzing suggester. + *

+ * The simplest implementation would look something like this: + *

+ * SuggestionFacade facade = new SuggestionFacade(new File("data"));
+ * facade.getDictionary().getPrioritesBySecondLevelSuggester().putAll(facade.secondLevelSuggestionFactory());
+ * ...
+ * QuerySession session = facade.getQuerySessionManager().sessionFactory();
+ * ...
+ * String query = "heros of mght and magik";
+ * Hits hits = searcher.search(queryFactory(query));
+ * String suggested = facade.didYouMean(query);
+ * session.query(query, hits.length(), suggested);
+ * ...
+ * facade.getQuerySessionManager().getSessionsByID().put(session);
+ * ...
+ * facade.trainExpiredSessions();
+ * ...
+ * facade.close();
+ * 
+ *

+ * The trainer is fed with trees of {@link org.apache.lucene.search.didyoumean.QueryGoalNode} instances. Each such + * tree represent the events that took place while a user tried to find content within a certain context: a goal tree. + * An instance of {@link org.apache.lucene.search.didyoumean.QueryGoalTreeExtractor} will help you to find and isolate + * all the goals in a tree representing a complete user session, as they sometimes contain more than one. + *

+ * It is up to the trainer and the suggester to decide how suggestions in the dictionary are stored and modified. Thus + * all trainers and suggesters might not be compatible with each other. + *

+ * {@link org.apache.lucene.search.didyoumean.impl.DefaultQueryGoalTreeExtractor} + * {@link org.apache.lucene.search.didyoumean.impl.DefaultTrainer} + * {@link org.apache.lucene.search.didyoumean.impl.DefaultSuggester} + *

+ * {@link org.apache.lucene.search.didyoumean.impl.DefaultAprioriCorpusFactory} + *

+ *

+ * + * @author Karl Wettin + * Date: 2007-feb-17 + * Time: 02:32:50 + */ +public class SuggestionFacade { + + private Dictionary dictionary; + private Suggester suggester; + + private QuerySessionManager querySessionManager; + private QueryGoalTreeExtractor queryGoalTreeExtractor; + + private Trainer trainer; + + private AprioriCorpusFactory aprioriCorpusFactory; + + public SuggestionFacade(File dataRootPath) throws DatabaseException { + this(new File(dataRootPath, "dictionary"), new File(dataRootPath, "querySessionManager")); + } + + public SuggestionFacade(File dictionaryPath, File querySessionManagerPath) throws DatabaseException { + this(dictionaryPath, querySessionManagerPath, new DefaultSuggester(), new DefaultTrainer(), new DefaultQueryGoalTreeExtractor(), new DefaultAprioriCorpusFactory()); + } + + public SuggestionFacade(File bdbRoot, Suggester suggester, Trainer trainer, QueryGoalTreeExtractor queryGoalTreeExtractor, AprioriCorpusFactory aprioriCorpusFactory) throws DatabaseException { + this(new File(bdbRoot, "dictionary"), new File(bdbRoot, "querySessionManager"), suggester, trainer, queryGoalTreeExtractor, aprioriCorpusFactory); + } + + + public SuggestionFacade(File dictionaryPath, File querySessionManagerPath, Suggester suggester, Trainer trainer, QueryGoalTreeExtractor queryGoalTreeExtractor, AprioriCorpusFactory aprioriCorpusFactory) throws DatabaseException { + this(new Dictionary(dictionaryPath), new QuerySessionManager(querySessionManagerPath), suggester, trainer, queryGoalTreeExtractor, aprioriCorpusFactory); + } + + public SuggestionFacade(Dictionary dictionary, QuerySessionManager querySessionManager, Suggester suggester, Trainer trainer, QueryGoalTreeExtractor queryGoalTreeExtractor, AprioriCorpusFactory aprioriCorpusFactory) throws DatabaseException { + this.dictionary = dictionary; + this.querySessionManager = querySessionManager; + this.suggester = suggester; + this.trainer = trainer; + this.queryGoalTreeExtractor = queryGoalTreeExtractor; + this.aprioriCorpusFactory = aprioriCorpusFactory; + + didYouMean("warmup"); // this is a bugfix + } + + + public void close() throws DatabaseException { + dictionary.close(); + querySessionManager.close(); + } + + public Suggestion[] didYouMean(String query, int n) throws DatabaseException { + return getSuggester().didYouMean(getDictionary(), query, n); + } + + public String didYouMean(String query) throws DatabaseException { + // todo remove debug + long ms = System.currentTimeMillis(); + String ret = getSuggester().didYouMean(getDictionary(), query); + ms = System.currentTimeMillis() - ms; + System.out.println(ms + "ms\t " + query + " -> " + ret); + return ret; + } + + /** + * Gathers and trains all expired query sessions from the query session manager + */ + public synchronized void trainExpiredQuerySessions() throws DatabaseException { + trainExpiredQuerySessions(1); + } + + /** + * Gathers and trains all expired query sessions from the query session manager + */ + public synchronized void trainExpiredQuerySessions(int maxThreads) throws DatabaseException { + trainExpiredQuerySessions(maxThreads, 10000); + } + + /** + * Gathers and trains all expired query sessions from the query session manager + */ + public synchronized void trainExpiredQuerySessions(int maxThreads, int batchSize) throws DatabaseException { + int count = 0; + for (; ;) { + + final ConcurrentLinkedQueue> queue = new ConcurrentLinkedQueue>(); + + for (QuerySession querySession : getQuerySessionManager().getSessionsByID().map().values()) { + if (querySession.isExpired()) { + count++; + queue.add((QuerySession) querySession); + if (queue.size() == batchSize) { + break; + } + } + } + + if (queue.size() == 0) { + break; + } + + Thread[] threads = new Thread[maxThreads]; + for (int i = 0; i < threads.length; i++) { + threads[i] = new Thread(new Runnable() { + public void run() { + QuerySession session; + while ((session = queue.poll()) != null) { + try { + trainSession(session); + getQuerySessionManager().getSessionsByID().delete(session.getId()); + } catch (DatabaseException dbe) { + dbe.printStackTrace(); + // todo + } + } + } + }); + threads[i].start(); + } + for (Thread thread : threads) { + try { + thread.join(); + } catch (InterruptedException ie) { + ie.printStackTrace(); + // todo + } + } + + System.out.println(count + " sessions trained."); + } + + System.out.println("Finished training a total of " + count + " sessions."); + + } + + /** + * Extracts multiple goal trees from a tree containing all queries in a session, + * and queues each such goal tree to the trainer. + * + * @param session the session to be trained + * @throws DatabaseException + */ + public void trainSession(QuerySession session) throws DatabaseException { + trainSessionQueryTree(session.getNodes().get(0)); + } + + /** + * Extracts multiple goal trees from a tree containing all queries in a session, + * and queues each such goal tree to the trainer. + * + * @param session any node (preferably the root) of a query goal tree + * @throws DatabaseException + */ + public void trainSessionQueryTree(QueryGoalNode session) throws DatabaseException { + for (QueryGoalNode goalTreeRoot : getQueryGoalTreeExtractor().extractGoalRoots(session.getRoot())) { + getTrainer().trainGoalTree(getDictionary(), goalTreeRoot); + } + } + + + /** + * Compiles algorithmic second level suggesters based on the data in the dictionary. + * + * @return + * @throws IOException + * @throws DatabaseException + */ + public Map secondLevelSuggestionFactory() throws IOException, DatabaseException { + return secondLevelSuggestionFactory( + new IndexFacadeFactory() { + public IndexFacade factory() throws IOException { + return new InstantiatedIndexFacade(new InstantiatedIndex()); + } + } + ); + } + + /** + * Compiles algorithmic second level suggesters based on the data in the dictionary. + * += * @param indexFacadeFactory index in which to store ngrams created by all terms in the a priori corpus + * @return a second level suggester + * @throws IOException + * @throws DatabaseException + */ + public Map secondLevelSuggestionFactory(IndexFacadeFactory indexFacadeFactory) throws IOException, DatabaseException { + return secondLevelSuggestionFactory(null, null, null, indexFacadeFactory, "apriori", indexFacadeFactory, 2, 7); + } + + /** + * Compiles algorithmic second level suggesters based on the data in the dictionary. + * + * @param systemIndex system index user queries are placed in. optional. + * @param systemIndexField field in system index used to extract ngram tokens. must not be null if systemIndex is available. + * @param systemNgramIndexFacadeFactory + * @param aprioriIndexFacadeFactory index in which to store the created a priori corpus + * @param aprioriField field in a priori index to store values + * @param aprioriNgramIndexFacadeFactory index in which to store ngrams created by all terms in the a priori corpus + * @param minNgramSize minimum ngram size. 2 makes sense. + * @param maxSuggestionsPerWord maximum number of suggestions per word in matrix. A maximum of n^w queries will be placed. + * @return a second level suggester + * @throws IOException + * @throws DatabaseException + */ + public Map secondLevelSuggestionFactory(IndexFacade systemIndex, String systemIndexField, IndexFacadeFactory systemNgramIndexFacadeFactory, IndexFacadeFactory aprioriIndexFacadeFactory, String aprioriField, IndexFacadeFactory aprioriNgramIndexFacadeFactory, int minNgramSize, int maxSuggestionsPerWord) throws IOException, DatabaseException { + if (systemIndex != null && systemIndexField == null) { + throw new NullPointerException("systemIndexField must be set if systemIndex is present."); + } + + Map ret = new HashMap(2); + + Analyzer analyzer = new StandardAnalyzer(Collections.emptySet()); + + + System.out.println("Creating a priori corpus..."); + IndexFacade aprioriIndex = aprioriIndexFacadeFactory.factory(); + + getAprioriCorpusFactory().factory(getDictionary(), getSuggester(), aprioriIndex, aprioriField, analyzer); + + System.out.println("Creating ngram index from a priori corpus terms..."); + IndexFacade aprioriNgramIndex = aprioriNgramIndexFacadeFactory.factory(); + aprioriNgramIndex.indexWriterFactory(null, true).close(); // reset + NgramTokenSuggester ngramTokenSuggester = new NgramTokenSuggester(aprioriNgramIndex); + IndexReader aprioriIndexReader = aprioriIndex.indexReaderFactory(); + ngramTokenSuggester.indexDictionary(new TermEnumIterator(aprioriIndexReader, aprioriField), minNgramSize); + aprioriIndexReader.close(); + + ret.put(new SecondLevelTokenPhraseSuggester(ngramTokenSuggester, aprioriField, false, maxSuggestionsPerWord, analyzer, aprioriIndex), 3d); + + if (systemIndex != null) { + System.out.println("Creating ngram index from system corpus terms..."); + IndexFacade systemNgramIndex = systemNgramIndexFacadeFactory.factory(); + systemNgramIndex.indexWriterFactory(null, true).close(); // reset + NgramTokenSuggester sysetmNgramTokenSuggester = new NgramTokenSuggester(systemNgramIndex); + IndexReader systemIndexReader = systemIndex.indexReaderFactory(); + ngramTokenSuggester.indexDictionary(new TermEnumIterator(systemIndexReader, systemIndexField), minNgramSize); + systemIndexReader.close(); + + ret.put(new MultiTokenSuggester(sysetmNgramTokenSuggester, systemIndexField, true, maxSuggestionsPerWord, analyzer, systemIndex), 1d); + } + + return ret; + } + + + public Dictionary getDictionary() { + return dictionary; + } + + public void setDictionary(Dictionary dictionary) { + this.dictionary = dictionary; + } + + public Suggester getSuggester() { + return suggester; + } + + public void setSuggester(Suggester suggester) { + this.suggester = suggester; + } + + public Trainer getTrainer() { + return trainer; + } + + public void setTrainer(Trainer trainer) { + this.trainer = trainer; + } + + public QueryGoalTreeExtractor getQueryGoalTreeExtractor() { + return queryGoalTreeExtractor; + } + + public void setQueryGoalTreeExtractor(QueryGoalTreeExtractor queryGoalTreeExtractor) { + this.queryGoalTreeExtractor = queryGoalTreeExtractor; + } + + + public QuerySessionManager getQuerySessionManager() { + return querySessionManager; + } + + public void setQuerySessionManager(QuerySessionManager querySessionManager) { + this.querySessionManager = querySessionManager; + } + + + public AprioriCorpusFactory getAprioriCorpusFactory() { + return aprioriCorpusFactory; + } + + public void setAprioriCorpusFactory(AprioriCorpusFactory aprioriCorpusFactory) { + this.aprioriCorpusFactory = aprioriCorpusFactory; + } +} Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/QueryGoalNode.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/QueryGoalNode.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/QueryGoalNode.java (revision 0) @@ -0,0 +1,287 @@ +package org.apache.lucene.search.didyoumean; + +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +import com.sleepycat.persist.model.Persistent; + +import java.io.Serializable; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; +import java.util.ArrayList; +import java.util.concurrent.CopyOnWriteArrayList; + +/** + * A node in a tree describing what queries, suggestions and inspection of results + * that a user undertook while searching for a specific thing. + * + * @author Karl Wettin + * Date: 2007-jan-31 + * Time: 06:23:42 + */ +@Persistent +public class QueryGoalNode { + + private QueryGoalNode parent; + private List> children = new ArrayList>(); + + /** time of query */ + private Long timestamp; + /** user query */ + private String query; + /** number of hits in corpus from query */ + private Integer corpusQueryResults; + /** the suggested text from the dictonary */ + private String suggestion; + + /** corpus query results inspected by user */ + private List inspections = new LinkedList(); + + /** bdb persistence */ + private QueryGoalNode() { + } + + public QueryGoalNode(QueryGoalNode parentNode, String query, Integer corpusQueryResults) { + this(parentNode, query, corpusQueryResults, null, System.currentTimeMillis()); + } + + public QueryGoalNode(QueryGoalNode parentNode, String query, Integer corpusQueryResults, Long timeStamp) { + this(parentNode, query, corpusQueryResults, null, timeStamp); + } + + + public QueryGoalNode(QueryGoalNode parentNode, String query, Integer corpusQueryResults, String suggestion) { + this(parentNode, query, corpusQueryResults, suggestion, System.currentTimeMillis()); + } + + public QueryGoalNode(QueryGoalNode parentNode, String query, Integer corpusQueryResults, String suggestion, Long timeStamp) { + this.parent = parentNode; + if (parentNode != null) { + parentNode.getChildren().add(this); + } + this.query = query; + this.corpusQueryResults = corpusQueryResults; + this.suggestion = suggestion; + this.timestamp = timeStamp; + } + + public Double calculateInspectionWeight() { + if (getInspections().size() == 0) { + return null; + } + double score = 1d; + for (Inspection inspection : getInspections()) { + score *= 1 + inspection.getGoalClassification(); + } + return score; + } + + public QueryGoalNode getRoot() { + QueryGoalNode node = this; + while (node.getParent() != null) { + node = node.getParent(); + } + return node; + } + + + public QueryGoalNode getParent() { + return parent; + } + + public void setParent(QueryGoalNode parent) { + this.parent = parent; + } + + public List> getChildren() { + return children; + } + + public void setChildren(List> children) { + this.children = children; + } + + public String getQuery() { + return query; + } + + public void setQuery(String query) { + this.query = query; + } + + public String getSuggestion() { + return suggestion; + } + + public void setSuggestion(String suggestion) { + this.suggestion = suggestion; + } + + public Long getTimestamp() { + return timestamp; + } + + public void setTimestamp(Long timestamp) { + this.timestamp = timestamp; + } + + + public Integer getcorpusQueryResults() { + return corpusQueryResults; + } + + public void setcorpusQueryResults(Integer corpusQueryResults) { + this.corpusQueryResults = corpusQueryResults; + } + + public List getInspections() { + return inspections; + } + + public void setInspections(List inspections) { + this.inspections = inspections; + } + + /** + * The user was really looking for Windows vista, but searched only for vista. + * Buena vista social club was found, but as this subject also was compelling + * the user went on and read about that instead. + */ + public static final Double NO_PART_OF_THE_GOAL = -1d; + + /** + * We have no clue to wether the inspected reference was a part of the goal or not. + */ + public static final Double MOO = 0d; + + /** + * To set this high value, the users should explicitally have told the system + * that they found just the thing they were looking for. + */ + public static final Double GOAL = 1d; + + @Persistent + public class Inspection { + /** reference to the inspected item. */ + private R reference; + /** time of inspection */ + private Long timeStamp; + /** wether or not this inspection was the goal of the query. */ + private double goalClassification; + + public Inspection(R reference, double goalClassification) { + this(reference, goalClassification, System.currentTimeMillis()); + } + + public Inspection(R reference, double goalClassification, Long timeStamp) { + this.reference = reference; + this.goalClassification = goalClassification; + this.timeStamp = timeStamp; + QueryGoalNode.this.getInspections().add(this); + } + + + public R getReference() { + return reference; + } + + public void setReference(R reference) { + this.reference = reference; + } + + public Long getTimeStamp() { + return timeStamp; + } + + public void setTimeStamp(Long timeStamp) { + this.timeStamp = timeStamp; + } + + public double getGoalClassification() { + return goalClassification; + } + + public void setGoalClassification(double goalClassification) { + this.goalClassification = goalClassification; + } + } + + public int numChildrenRecursive() { + int i = 0; + Iterator> it = iterateChildrenRecursive(); + while (it.hasNext()) { + it.next(); + i++; + } + return i; + } + + public Iterator> iterateChildrenRecursive() { + return new Iterator>() { + + Iterator> children = getChildren().iterator(); + Iterator> subChildren; + + Iterator> currentChildren; + + QueryGoalNode currentChild; + QueryGoalNode nextChild; + + public boolean hasNext() { + + if (nextChild != null) { + return true; + } + + while (true) { + if (subChildren == null) { + if (children.hasNext()) { + nextChild = children.next(); + subChildren = nextChild.iterateChildrenRecursive(); + currentChildren = children; // for remove + return true; + } else { + return false; + } + } else if (subChildren.hasNext()) { + nextChild = subChildren.next(); + currentChildren = subChildren; // for remove + return true; + } else { + subChildren = null; + } + } + } + + public QueryGoalNode next() { + hasNext(); + currentChild = nextChild; + nextChild = null; + return currentChild; + } + + public void remove() { + currentChildren.remove(); + } + }; + } + + public String toString() { + return getQuery() + " " + getcorpusQueryResults() + " " + getInspections().size(); + } + +} Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/package.html =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/package.html (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/package.html (revision 0) @@ -0,0 +1,318 @@ + + + + + + + + +

Abstract

+ +

+ A dictionary of correctly and incorrectly spelled words + pointing at one or many weighted suggestions + created from previous user activity + and backed by algorithmic suggestions. +

+ +

Consumer interface

+ +

The simplest implementation would look something like this:

+ +
+SuggestionFacade facade = new SuggestionFacade(new File("data"));
+facade.getDictionary().getPrioritesBySecondLevelSuggester().putAll(facade.secondLevelSuggestionFactory());
+...
+QuerySession session = facade.getQuerySessionManager().sessionFactory();
+...
+String query = "heros of mght and magik";
+Hits hits = searcher.search(queryFactory(query));
+String suggested = facade.didYouMean(query);
+session.query(query, hits.length(), suggested);* ...
+facade.getQuerySessionManager().getSessionsByID().put(session);
+...
+facade.trainExpiredSessions();
+...
+facade.close();
+
+ +

Usage can be much more complex than that and would then yeild in better suggestions.

+ +

Rationale

+ +

+ These are the results from contemplating on the relationships between +

    +
  • the user a priori (what the users knows to be right)
  • +
  • user queries (what the user is searching for in the system corpus) and how they are changed during a session in + quest for one or many specific goals +
  • +
  • the results a user inspected and perhaps even classified by taking further actions
  • +
  • the system corpus (the information available to search in)
  • +
  • and the system a priori (things the system knows to be right, probably a part of the system corpus you trust to + be right. E.g. movie titles, but not user comments.) +
  • +
+

+ +

+ As a user progress in their search for data in a corpus, + by supplying queries, + accepting and decline suggestions, + manually changing queries + and inspect results in different ways, + the system tries to find the goals of the event tree + and suggest these goals to future users with similair behaviour. +

+ +

It is not completly wrong to call this a semi-retarded reinforcement learning strategy.

+ +

+ When a user requests a suggestion to a query prior unknown to the dictonary, + second level suggesters will attempt to come up with something. + These are usually algorithmic and not aware of the dictionary. +

+ +

+ In a young implementation, or in one with very little user load, + the dictionary will more or less act as a cache against the second level suggesters. +

+ +

+ Spell corrections, synonyms and acronyms + are the most common effects I have seen from an active adaptative suggester. +

+ +

+ As all queries ever placed are stored in the dictionary (until pruned) + and the dictionary key is a whitespace and punctuation free version of the query, + any decomposited or composited typo will be handled. + For instance, "leonardo da vinci" will get the key "leonardodavinci" and thus suggest + "leonardo da vinci" in case the input query is "leonardo davinci". +

+ +

Query goal trees

+ +

+ A user session could contain multiple quests for content. + For example: + first the user looks for the Apache licence, + spells it wrong, inspects different results, + and then the user searches for the author Ivan Goncharov. +

+ +

+ In this package they are called different query goals. +

+ +

+ It is up to the QueryGoalTreeExtractor implementations to decide what + events in a session are parts of the same goal, + as we don't want to suggest the user to check out Goncharov + when they are looking for the Apache license. +

+ +

+ In the default query goal tree extractor, + nodes are parts of the same goal as their parent when: +

    +
  • The queries are the same.
  • +
  • The user took a suggestion from the system.
  • +
  • The current and the parent queries are similair enough.
  • +
  • The queries was entered within short enough time.
  • +
+

+

+ User activities are kept track of in a tree of QueryGoalNode:s, + where each node contains a user query, + if the current query (node) was a suggestion to a previous user query (node), + what search results was further inspected, + when it happend, + and for how long. +

+ +

+ Figure A.
+ figA +
+ User first look for "heroes of knight and magic",
+ followed by "heroes of light and magic",
+ goes back to the first query (knight) and changes that to "heroes of night and magic",
+ followed by "heroes of might and magic".
+ The user adds a comment in the third hit
+ and then inspects the first hit,
+ see that "homm" is a common acronym and searches for that too.
+ Inspects two hits in homm
+

+ +

+ Figure B.
+ figB +

+ +

+ The tree is sent to a trainer that in this case guesses + "heroes of might and magic" and "homm" seems to be what the user was looking for. + All non-inspected results are then adapted to suggest the most similar goal. + The two final goals are adapted to suggest each other. +

+ + +

Adaptive training

+ +

+ Adaptive means that the suggestions to a query + depends on how users previously have been acting. + This means that the dictionary could be tampered with quite easy by hammering the trainer with data + and you should therefore try to train only with data from trusted users. +

+ +

+ The default trainer implementation works like this: +

    +
  • If a user accepts the suggestion made by the system, then the score for that suggestion is incresed. + (positive adaptation) +
  • +
  • If a user does not accept the suggestion made by the system, then the score for that suggestion is decreased. + (negative adaptation) +
  • +
  • + If the goal tree is a single query (perhaps with multiple inspections) + then adapt negative on suggestion once again. +
  • +
  • + Suggestions are the queries with inspections, ordered by the classification weight. + All the queries in the goal without inspections will be adpated positive with + the query with inspections that has the shortest edit distance. +
  • +
  • Suggests back from best goal to second best goal. homm -> heroes of might and magic -> homm
  • +
+

+ +

Suggesting

+ +

+ Suggestions are created by the suggester, that navigates a dictionary. + The default implementation works like this: +

    +
  • + Returns highest scoring suggestion available, + unless the score is lower than the suggestion supression threadshold. +
  • +
  • + If there are no suggestions available, the second level suggesters + registred to the dictionary are used to produce the suggestions. +
  • +
  • + If the top scoring suggestion is same as the query, + and the second best is not supressed below threadshold, + switch the top two suggestions. (This should probably be a setting.) +
  • +
+ Ignoring a suggestion 50 times or so with a DefaultTrainer + supresses a suggestions below the default threadshold. + If a user manually change a query to a suppresed value + and thus be suggested again if no other suggestion is considered more important. +

+ +

Second level suggesting

+ +

+ If the dictionary does not contain a suggestion for a given query, + it will be passed on to any available SecondLevelSuggester, + usually an algorithmic suggestion scheme + that hopefully can come up with something. + When a user accepts such a suggestion + it will be picked up by the adaptive trainer + and become a part of the dictionary. +

+ +

A priori corpus dependency

+ +

+ In order for second level suggesters to work they need a corpus that is known to be correct: + an a priori corpus. This ought to be a small index that only contain text people do not know how + to spell, i.e. it only consume clock ticks at suggestion time if it is filled with phrases + people always get right. +

+ +

A priori corpus extraction

+ +

+ By inverting the dictionary and extracting the tokens or phrases people have most problem spelling, + a pretty good a priori corpus can be created. This will however require a dictionary that has been + exposed to quite some user activity. +

+ +

Token suggesters

+ +

+ The lowest level of suggestion is single token suggestions, + and the default implementation is a refactor of the contrib/spellcheck. +

+ +

NgramTokenSuggester

+ +

A refactor of the Lucene crontrib/spellcheck single token ngram suggester

+ +

TokenPhraseSuggester

+ +

+ Combines the output from a single token suggester, applied to each token (word) in a phrase, + to create possible phrases. In essense this is the same thing as a SpanFuzzyQuery. +

+ +

+ For example, the user places the query "thh best game". + The matrix of similar tokens are: +

+  the best game
+  tho rest fame
+           lame
+
+ These can be combined as: +
+  tho best game
+  tho best fame
+  tho best lame
+  tho rest game
+  tho rest fame
+  tho rest lame
+  the best game
+  the best fame
+  the best lame
+  the rest game
+  the rest fame
+  the rest lame
+
+ A query is created for each combination to find valid suggestions in the a priori index. + In the default implementation SpanNearQueries are used. +

+

+ If any of the valid hits contains a TermPositionVector + it will be analyzed and suggest the query in the order of terms in the corpus. +

+ +

+ E.g. query "camel that broke the staw" is suggested with "straw that broke the camel" +

+ +

+ todo: if term offsets available and value stored, suggest that value (for cosmetic reasons). +

+ +

MultiTokenSuggester

+ +

+ Works similar to the TokenPhraseSuggester, but uses term queries rather than span near queries, + used when people place queries where the terms within the query really has nothing to do with each other. + This second level suggester should have a lower priority than the TokenPhraseSuggester. +

+ + +
+ + + Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/DefaultQueryGoalJuror.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/DefaultQueryGoalJuror.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/DefaultQueryGoalJuror.java (revision 0) @@ -0,0 +1,78 @@ +package org.apache.lucene.search.didyoumean.impl; + +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +import org.apache.lucene.search.didyoumean.QueryGoalNode; + +import java.util.*; + +/** + * Sets the chronologycally last placed query as the goal. + * + * @see QueryGoalJuror + * @author Karl Wettin + * Date: 2007-feb-23 + * Time: 23:53:21 + */ +public class DefaultQueryGoalJuror implements QueryGoalJuror { + + // private static Log log = LogFactory.getLog(HitsBasedJuror.class); + // private static long serialVersionUID = 1l; + + /** + * Sets the chronologycally last placed query as the goal. + * @param goalTreeRootNode Query goal tree root + * @return the goal nodes + * @throws RuntimeException If query goal tree already contains one or more goals. + */ + public List> createGoals(QueryGoalNode goalTreeRootNode) { + + List> nodes = new ArrayList>(); + + for (Iterator> children = goalTreeRootNode.iterateChildrenRecursive(); children.hasNext();) { + QueryGoalNode node = children.next(); + for (QueryGoalNode.Inspection inspection : node.getInspections()) { + if (inspection.getGoalClassification() > QueryGoalNode.MOO) { + throw new RuntimeException("The query goal tree already contains one or more goals!"); + } + } + nodes.add(node); + } + nodes.add(goalTreeRootNode); + + + Collections.sort(nodes, new Comparator>() { + public int compare(QueryGoalNode queryGoalNode, QueryGoalNode queryGoalNode1) { + return queryGoalNode1.getTimestamp().compareTo(queryGoalNode.getTimestamp()); + } + + }); + + List> goalNodes = new LinkedList>(); + + if (nodes.get(0).getcorpusQueryResults() > 0) { + + nodes.get(0).new Inspection(null, QueryGoalNode.GOAL); + goalNodes.add(nodes.get(0)); + } + + return goalNodes; + + + } + +} Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/DefaultQueryGoalTreeExtractor.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/DefaultQueryGoalTreeExtractor.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/DefaultQueryGoalTreeExtractor.java (revision 0) @@ -0,0 +1,240 @@ +package org.apache.lucene.search.didyoumean.impl; + +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +import org.apache.lucene.search.didyoumean.EditDistance; +import org.apache.lucene.search.didyoumean.Levenshtein; +import org.apache.lucene.search.didyoumean.QueryGoalNode; + +import java.io.Serializable; +import java.util.*; +import java.util.concurrent.ConcurrentLinkedQueue; + +/** + *

+ * A not too advanced way to extract the goals from a session. + *

+ * Nodes are parts of the same goal as their parent when: + *

    + *
  • the queries are the same
  • + *
  • the suggestion to the parent query was followed
  • + *
  • the queries are similair enough
  • + *
  • the queries was entered within short enough time
  • + *
+ * + * @author Karl Wettin + * Date: Aug 1, 2006 + * Time: 5:01:14 PM + */ +public class DefaultQueryGoalTreeExtractor implements org.apache.lucene.search.didyoumean.QueryGoalTreeExtractor, Serializable { + + private static final long serialVersionUID = 1l; + + public static final long DEFAULT_MAXIMUM_MILLISECONDS_BETWEEN_QUERIES = 20000; + public static final double DEFAULT_MINIMUM_SIMILARITY = 0.6f; + public static final int DEFAULT_MINIMUM_HITS_IN_FINAL_QUERY = 1; + public static final double DEFAULT_MINIMUM_HITS_RATIO = 0f; + + + private long maximumMillisecondsBetweenQueries = DEFAULT_MAXIMUM_MILLISECONDS_BETWEEN_QUERIES; + private double minimumSimilarity = DEFAULT_MINIMUM_SIMILARITY; + private int minimumHitsInFinalQuery = DEFAULT_MINIMUM_HITS_IN_FINAL_QUERY; + private double minimumHitsRatio = DEFAULT_MINIMUM_HITS_RATIO; + + private double inspectionWeightGoalThreadshold = 0d; + + private class GoalFactory { + private List> goalQueries = new LinkedList>(); + } + + public boolean isPartOfParentGoal(QueryGoalNode child) { + if (child.getParent() == null) { + // no parent + return false; + } else if (child.getQuery().equals(child.getParent().getSuggestion())) { + // the user followed a suggestion made by the system. + return true; + } else if (child.getQuery().equals(child.getParent().getQuery())) { + // it is the same query placed once more. + // todo: consider if this should have been merged when creating a new child in DefaultTrainer? + return true; + } else { + // check edit distance + + EditDistance editDistance = editDistanceFactory(child.getParent().getQuery()); + //double similarity = ((double) distance / previousLink.link.length()); + double similarity = 1.0f - ((double) editDistance.getDistance(child.getQuery()) / Math.min(child.getQuery().length(), child.getParent().getQuery().length())); + if (similarity >= getMinimumSimilarity()) { + // similair enough + return true; + } else if (child.getTimestamp() - child.getParent().getTimestamp() < getMaximumMillisecondsBetweenQueries()) { + // todo: consider time spent inspecting et c. + + // not similair enough. + // but fast enough between searches. + + // if the child or its children reached the goal // todo recursive + // then it is a part of the parent goal. + + Double weight = child.calculateInspectionWeight(); + if (weight == null) { + return false; + } else { + return weight > getInspectionWeightGoalThreadshold(); + } + + } + } + + return false; + } + + public EditDistance editDistanceFactory(String sd) { + return new Levenshtein(sd); + } + + + public List> extractGoalRoots(QueryGoalNode sessionRoot) { + + List> goalRoots = new LinkedList>(); + + Map, GoalFactory> goalFactoryByNodes = new HashMap, GoalFactory>(); + + + Queue> leafsQueue = new ConcurrentLinkedQueue>(); + for (Iterator> leafsIterator = sessionRoot.iterateChildrenRecursive(); leafsIterator.hasNext();) { + QueryGoalNode node = leafsIterator.next(); + if (node.getChildren().size() == 0) { + leafsQueue.add(node); + } + } + + if (leafsQueue.size() == 0) { + // this is just a single node. + goalRoots.add(sessionRoot); + } else while (leafsQueue.size() > 0) { + QueryGoalNode node = leafsQueue.poll(); + while (isPartOfParentGoal(node)) { + GoalFactory nodeGoalFactory = goalFactoryByNodes.get(node); + if (nodeGoalFactory == null) { + nodeGoalFactory = new GoalFactory(); + nodeGoalFactory.goalQueries.add(node); + goalFactoryByNodes.put(node, nodeGoalFactory); + } + + if (node.getParent() != null) { + GoalFactory parentGoalFactory = goalFactoryByNodes.get(node.getParent()); + if (parentGoalFactory != null) { + // the parent has already been added to a goal factory + // merge this goal factory with the parent goal factory. + for (QueryGoalNode childGoalNode : goalFactoryByNodes.get(node).goalQueries) { + goalFactoryByNodes.put(childGoalNode, parentGoalFactory); + parentGoalFactory.goalQueries.add(childGoalNode); + } + } else { + goalFactoryByNodes.put(node, nodeGoalFactory); + nodeGoalFactory.goalQueries.add(node.getParent()); + } + } + + node = node.getParent(); + } + + // make current node a new root + if (node.getParent() != null) { + leafsQueue.add(node.getParent()); // parent is a new goal leaf! + node.getParent().getChildren().remove(node); + node.setParent(null); + } + goalRoots.add(node); + } + + return goalRoots; + } + + /** + * @return maximun time in milliseconds between two queries to be considered part of the same correction sequence + */ + public final long getMaximumMillisecondsBetweenQueries() { + return maximumMillisecondsBetweenQueries; + } + + /** + * @param maximumMillisecondsBetweenQueries + * maximun time in milliseconds between two queries to be considered part of the same correction sequence + */ + public final void setMaximumMillisecondsBetweenQueries(long maximumMillisecondsBetweenQueries) { + this.maximumMillisecondsBetweenQueries = maximumMillisecondsBetweenQueries; + } + + /** + * @return minimum edit-distance between two queries in order to be considered part of the same correction sequence + */ + public final double getMinimumSimilarity() { + return minimumSimilarity; + } + + /** + * Using Levenshtein edit distance factory, + * anything less than 0.5 is crazy, + * 0.7 can be almost be considered fail-safe, + * but as this is an adapting spell checker some bad data is ok, + * so I set default to 0.6 + * + * @param minimumSimilarity minimum edit-distance between two queries in order to be considered part of the same correction sequence + */ + public final void setMinimumSimilarity(double minimumSimilarity) { + this.minimumSimilarity = minimumSimilarity; + } + + /** + * @return number of hits required in the final query of a correction sequence + */ + public final int getMinimumHitsInFinalQuery() { + return minimumHitsInFinalQuery; + } + + /** + * @param minimumHitsInFinalQuery number of hits required in the final query of a correction sequence + */ + public final void setMinimumHitsInFinalQuery(int minimumHitsInFinalQuery) { + this.minimumHitsInFinalQuery = minimumHitsInFinalQuery; + } + + /** + * @return minimum allowed ratio of the low and top number of hits of any two queries in a correction sequence. default to 2, that is twice as many hits in the top resulting query is required + */ + public final double getMinimumHitsRatio() { + return minimumHitsRatio; + } + + /** + * @param minimumHitsRatio minimum allowed ratio of the low and top number of hits of any two queries in a correction sequence. default to 2, that is twice as many hits in the top resulting query is required + */ + public final void setMinimumHitsRatio(double minimumHitsRatio) { + this.minimumHitsRatio = minimumHitsRatio; + } + + + public double getInspectionWeightGoalThreadshold() { + return inspectionWeightGoalThreadshold; + } + + public void setInspectionWeightGoalThreadshold(double inspectionWeightGoalThreadshold) { + this.inspectionWeightGoalThreadshold = inspectionWeightGoalThreadshold; + } +} Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/DefaultTrainer.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/DefaultTrainer.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/DefaultTrainer.java (revision 0) @@ -0,0 +1,321 @@ +package org.apache.lucene.search.didyoumean.impl; + +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +import org.apache.lucene.search.didyoumean.*; +import org.apache.lucene.search.didyoumean.dictionary.Dictionary; +import org.apache.lucene.search.didyoumean.dictionary.SuggestionList; + +import java.util.*; + +import com.sleepycat.je.DatabaseException; + +/** + * A simple adapting suggestion strategy that updates the content of a dictionary based on what a query session looked like. + *

+ * If a user accepts the suggestion made by the system, then we increase the score for that suggestion. (positive adaptation) + * If a user does not accept the suggestion made by the system, then we decrease the score for that suggestion. (negative adaptation) + *

+ * The the goal tree is a single query, one query only (perhaps with multiple inspections) + * then we adapt negative once again. + *

+ * Suggestions are the query with inspections, ordered by the classification weight. + * All the queries in the goal witout inspections will be adpated positive with + * the query with inspections that has the shortest edit distance. + *

+ * Suggest back from best goal to second best goal. homm -> heroes of might and magic -> homm + *

+ * In case of no inspections in a goal tree, a {@link org.apache.lucene.search.didyoumean.impl.QueryGoalJuror} is used to judge what is the goal. + * + * @author Karl Wettin + * Date: Jul 31, 2006 + * Time: 4:56:13 PM + */ +public class DefaultTrainer implements Trainer { + + private boolean trainingFinalGoalAsSelf = true; + + private double notSuggestedPositiveAdaptationFactor = 1.4d; + private double acceptedSuggestionPositiveAdaptationFactor = 1.4d; + private double ignoredSuggestionNegativeAdaptationFactor = 0.9d; + + private QueryGoalJuror juror = new DefaultQueryGoalJuror(); + + private static Comparator scoreComparator = new Comparator() { + public int compare(Suggestion o1, Suggestion o2) { + return Double.compare(o2.getScore(), o1.getScore()); + } + }; + + + public void trainGoalTree(Dictionary dictionary, QueryGoalNode goalTreeRoot) throws DatabaseException { + + int numChildrenRecursive = 0; + // positive and negative adaptation of suggestion scores + for (Iterator> it = goalTreeRoot.iterateChildrenRecursive(); it.hasNext();) { + numChildrenRecursive++; + QueryGoalNode node = it.next(); + if (node.getParent().getSuggestion() != null) { + + SuggestionList suggestions = dictionary.getSuggestions(dictionary.keyFormatter(node.getParent().getQuery())); + Suggestion suggestion = suggestions.get(node.getParent().getSuggestion()); + + if (node.getQuery().equals(node.getParent().getSuggestion())) { + // user took our suggestion, increase the score of that suggestion. + suggestion.setScore(suggestion.getScore() * getAcceptedSuggestionPositiveAdaptationFactor()); + } else { + // user did not take our suggestion, decrease the score of that suggestion. + suggestion.setScore(suggestion.getScore() * getIgnoredSuggestionNegativeAdaptationFactor()); + } + + suggestions.sort(); + dictionary.getSuggestionsByQuery().put(suggestions); + + + } + } + + if (numChildrenRecursive == 0) { + + // a single query + + boolean train = false; + for (QueryGoalNode.Inspection inspection : goalTreeRoot.getInspections()) { + if (inspection.getGoalClassification() > QueryGoalNode.MOO) { + train = true; + break; + } + } + if (train) { + adaptPositive(dictionary, goalTreeRoot.getQuery(), goalTreeRoot.getcorpusQueryResults(), goalTreeRoot); + } +// +// // if inspected, negative train on top suggestion, +// // but only if available and not the same as the query +// SuggestionList suggestions = dictionary.getSuggestions(goalTreeRoot.getQuery()); +// if (goalTreeRoot.getInspections().size() > 0 && suggestions != null && suggestions.size() > 0 && !suggestions.get(0).getSuggested().equals(goalTreeRoot.getQuery())) { +// suggestions.get(0).setPopularity(suggestions.get(0).getPopularity() * getIgnoredSuggestionNegativeAdaptationFactor()); +// suggestions.sort(); +// } + } else { + + /** suggestions are the nodes with inspections, ordered by the classification weight. */ + List> nodesWithGoals = new LinkedList>(); + + /** all the other nodes, them without inspections, will suggest the suggestions. */ + List> nodesWithoutGoals = new LinkedList>(); + + if (goalTreeRoot.getInspections().size() > 0) { + nodesWithGoals.add(goalTreeRoot); + } else { + nodesWithoutGoals.add(goalTreeRoot); + } + for (Iterator> it = goalTreeRoot.iterateChildrenRecursive(); it.hasNext();) { + QueryGoalNode node = it.next(); + if (node.getInspections().size() > 0) { + for (QueryGoalNode.Inspection inspection : node.getInspections()) { + if (inspection.getGoalClassification() > QueryGoalNode.MOO) { + nodesWithGoals.add(node); + break; + } + } + } + + if (!nodesWithGoals.contains(node)) { + nodesWithoutGoals.add(node); + } + } + + if (nodesWithGoals.size() == 0) { + // there was no inspections. + // we have no clue what the users was looking for. not sure if this mean something extra. + + // call juror strategy + nodesWithGoals = getJuror().createGoals(goalTreeRoot); + nodesWithoutGoals.removeAll(nodesWithGoals); + if (nodesWithGoals.size() == 0) { + return; + } + } + + { + // order by inspection weight and time - most recent query with top importance. + Collections.sort(nodesWithGoals, new Comparator>() { + public int compare(QueryGoalNode queryGoalNode, QueryGoalNode queryGoalNode1) { + int ret = Double.compare(queryGoalNode1.calculateInspectionWeight(), queryGoalNode.calculateInspectionWeight()); + if (ret == 0) { + // todo: consider by number of hits? does the same inspection weight but more hits mean something better or something worse? + + // by time + if (queryGoalNode1.getTimestamp() == queryGoalNode.getTimestamp()) { + return 0; + } else if (queryGoalNode1.getTimestamp() > queryGoalNode.getTimestamp()) { + return 1; + } else { + return -1; + } + } else { + return ret; + } + } + }); + + if (isTrainingFinalGoalAsSelf()) { + // we also train the suggestion as suggested self, + // as we store the dictionary key stripped from characters. + // this will suggest "the da vinci code" from "thedavincicode" and "the davinci code". + + // it is however not needed to suppres bad suggestions from beeing suggested. + // for that its enough with a low enough suggestion score detected by the suggester. + + for (QueryGoalNode node : nodesWithGoals) { + + // but only register it once. + SuggestionList suggestions = dictionary.getSuggestions(node.getQuery()); + if (suggestions == null) { + suggestions = dictionary.suggestionListFactory(node.getQuery()); + } + + if (!suggestions.containsSuggested(nodesWithGoals.get(0).getQuery())) { + suggestions.addSuggested(nodesWithGoals.get(0).getQuery(), 1d, nodesWithGoals.get(0).getcorpusQueryResults()); + dictionary.getSuggestionsByQuery().put(suggestions); + } + + // uncomment to adapt every time + // adaptPositive(dictionary, nodesWithGoals.get(0).getQuery(), nodesWithGoals.get(0).getCorpusQueryResults(), node); + } + } + + // suggest back from best goal to second best goal. homm -> heroes of might and magic -> homm + if (nodesWithGoals.size() > 1) { + adaptPositive(dictionary, nodesWithGoals.get(1).getQuery(), nodesWithGoals.get(1).getcorpusQueryResults(), nodesWithGoals.get(0)); + } + + // node without inspections are suggested to try the node with inspections that has least edit distance to the query. + for (QueryGoalNode node : nodesWithoutGoals) { + QueryGoalNode closestNode = findNodeWithShortestDistanceToQuery(node, nodesWithGoals); + adaptPositive(dictionary, closestNode.getQuery(), closestNode.getcorpusQueryResults(), node); + } + } + } + } + + + public EditDistance editDistanceFactory(String sd) { + return new Levenshtein(sd); + } + + + private QueryGoalNode findNodeWithShortestDistanceToQuery(QueryGoalNode queryNode, Collection> targetNodes) { + EditDistance editDistance = editDistanceFactory(queryNode.getQuery()); + QueryGoalNode closest = null; + double closestDistance = Double.MAX_VALUE; + for (QueryGoalNode targetNode : targetNodes) { + double distanceToTarget = editDistance.getDistance(targetNode.getQuery()); + if (distanceToTarget < closestDistance) { + closestDistance = distanceToTarget; + closest = targetNode; + } + } + return closest; + } + + private void adaptPositive(Dictionary dictionary, String suggested, Integer suggestedCorpusQueryResults, QueryGoalNode dictionaryKeyNode) throws DatabaseException { + SuggestionList suggestions = dictionary.getSuggestions(dictionaryKeyNode.getQuery()); + if (suggestions == null) { + suggestions = dictionary.suggestionListFactory(dictionaryKeyNode.getQuery()); + suggestions.addSuggested(suggested, 1d, suggestedCorpusQueryResults); + } else { + boolean suggestionUpdated = false; + for (Suggestion existingSuggestion : suggestions) { + if (existingSuggestion.getSuggested().equals(suggested)) { + // the query already have this suggestion in the suggestions. + // increase the score for the suggestion. (positive adaptation) + double score = existingSuggestion.getScore() * getNotSuggestedPositiveAdaptationFactor(); + if (score > 9999) { + score = 9999; + } + existingSuggestion.setScore(score); + suggestionUpdated = true; + break; + } + } + if (!suggestionUpdated) { + suggestions.addSuggested(suggested, 1d, suggestedCorpusQueryResults); + } + suggestions.sort(); + } + dictionary.getSuggestionsByQuery().put(suggestions); + } + + + public double getAcceptedSuggestionPositiveAdaptationFactor() { + return acceptedSuggestionPositiveAdaptationFactor; + } + + public void setAcceptedSuggestionPositiveAdaptationFactor(double acceptedSuggestionPositiveAdaptationFactor) { + this.acceptedSuggestionPositiveAdaptationFactor = acceptedSuggestionPositiveAdaptationFactor; + } + + public double getIgnoredSuggestionNegativeAdaptationFactor() { + return ignoredSuggestionNegativeAdaptationFactor; + } + + public void setIgnoredSuggestionNegativeAdaptationFactor(double ignoredSuggestionNegativeAdaptationFactor) { + this.ignoredSuggestionNegativeAdaptationFactor = ignoredSuggestionNegativeAdaptationFactor; + } + + public double getNotSuggestedPositiveAdaptationFactor() { + return notSuggestedPositiveAdaptationFactor; + } + + public void setNotSuggestedPositiveAdaptationFactor(double notSuggestedPositiveAdaptationFactor) { + this.notSuggestedPositiveAdaptationFactor = notSuggestedPositiveAdaptationFactor; + } + + public boolean isTrainingFinalGoalAsSelf() { + return trainingFinalGoalAsSelf; + } + + public void setTrainingFinalGoalAsSelf(boolean trainingFinalGoalAsSelf) { + this.trainingFinalGoalAsSelf = trainingFinalGoalAsSelf; + } + + public static Comparator getScoreComparator() { + return scoreComparator; + } + + public static void setScoreComparator(Comparator scoreComparator) { + DefaultTrainer.scoreComparator = scoreComparator; + } + + + /** + * @return Stragegy used when a query goal tree has no goals. + */ + public QueryGoalJuror getJuror() { + return juror; + } + + /** + * @param juror Stragegy used when a query goal tree has no goals. + */ + public void setJuror(QueryGoalJuror juror) { + this.juror = juror; + } + +} Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/QueryGoalJuror.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/QueryGoalJuror.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/QueryGoalJuror.java (revision 0) @@ -0,0 +1,37 @@ +package org.apache.lucene.search.didyoumean.impl; + +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +import org.apache.lucene.search.didyoumean.QueryGoalNode; + +import java.util.List; + +/** + * When a query goal tree has no goals, an implementation of this interface is used to set them up. + * + * @author Karl Wettin + * Date: 2007-feb-24 + * Time: 02:25:25 + */ +public interface QueryGoalJuror { + + /** + * @param goalTreeRootNode Query goal tree root node. + * @return the goal nodes + */ + public abstract List> createGoals(QueryGoalNode goalTreeRootNode); +} Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/DefaultSuggester.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/DefaultSuggester.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/DefaultSuggester.java (revision 0) @@ -0,0 +1,232 @@ +package org.apache.lucene.search.didyoumean.impl; + +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +import com.sleepycat.je.DatabaseException; +import org.apache.lucene.search.didyoumean.AbstractSuggester; +import org.apache.lucene.search.didyoumean.EditDistance; +import org.apache.lucene.search.didyoumean.Levenshtein; +import org.apache.lucene.search.didyoumean.Suggestion; +import org.apache.lucene.search.didyoumean.dictionary.Dictionary; +import org.apache.lucene.search.didyoumean.dictionary.SuggestionList; + +import java.io.Serializable; +import java.util.Collections; +import java.util.Comparator; +import java.util.LinkedList; +import java.util.List; + +/** + * Returns highest scoring suggestion available, + * unless the score is lower than the suggestion supression threadshold. + *

+ * If there are no suggestions available, the second level suggesters + * registred to the dictionary are used to produce the suggestions. + *

+ * If the top scoring suggestion is same as the query, + * and the second best is not supressed below threadshold, + * change order + *

+ * Ignoring a suggestion 50 times or so with a DefaultTrainer makes a score hit 0.05d. + *

+ * It will navigate towards better suggestions this way: + *

+ *      Suggestion[] currentSuggestions = toQuerySensitiveArray(suggestions, n, query);
+ *      Suggestion[] topSuggestionSuggestions = toQuerySensitiveArray(gatherSuggestionList(dictionary, currentSuggestions[0].getSuggested(), n), n, currentSuggestions[0].getSuggested());
+ *      // if the input query results in bad results according to some strategy,
+ *      // then attempt to navigate towards a top suggestion that does result in something good,
+ *      while (
+ *          topSuggestionSuggestions != null && topSuggestionSuggestions.length > 0
+ *              && (currentSuggestions[0].getCorpusQueryResults().size() == 0
+ *              || currentSuggestions[0].getCorpusQueryResults().size() * 3 < topSuggestionSuggestions[0].getCorpusQueryResults().size())) {
+ *        currentSuggestions = topSuggestionSuggestions;
+ *        topSuggestionSuggestions = toQuerySensitiveArray(gatherSuggestionList(dictionary, currentSuggestions[0].getSuggested(), n), n, currentSuggestions[0].getSuggested());
+ *      }
+ * 
+ * todo: above should be an aggregated strategy! + * + * @author Karl Wettin + * Date: Jul 31, 2006 + * Time: 4:56:34 PM + */ +public class DefaultSuggester extends AbstractSuggester implements Serializable { + + private static final long serialVersionUID = 1l; + + + public DefaultSuggester() { + } + + /** + * ignoring a suggestion 50 times or so makes hit 0.05d. + */ + private double suggestionSupressionThreadshold = 0.05d; + + private SuggestionList gatherSuggestionList(Dictionary dictionary, String query, int n) throws DatabaseException { + SuggestionList suggestions = dictionary.getSuggestions(query); + if (suggestions != null && suggestions.size() > 0) { + // if top suggestion is suppressed, + // then try to get some more suggestions from second level + if (suggestions.get(0).getScore() < getSuggestionSupressionThreadshold()) { + Suggestion[] secondLevel = dictionary.getSecondLevelSuggestion(query, n); + if (secondLevel != null && secondLevel.length > 0) { + // then merge the not yet seen suggestions with the dictionary suggestions. + // this way we don't suggest something that has been suppressed. + double score = 1d; + for (Suggestion suggestion : secondLevel) { + score -= 0.01d; + if (!suggestions.containsSuggested(suggestion.getSuggested())) { + suggestions.addSuggested(suggestion.getSuggested(), score, suggestion.getCorpusQueryResults()); + dictionary.getSuggestionsByQuery().put(suggestions); + } + } + } + } + } + return suggestions; + } + + public Suggestion[] didYouMean(Dictionary dictionary, String query, int n) throws DatabaseException { + SuggestionList suggestions = gatherSuggestionList(dictionary, query, n); + if (suggestions != null) { + if (suggestions.size() > 0) { + Suggestion[] originalSuggestions = toQuerySensitiveArray(suggestions, n, query); + Suggestion[] currentSuggestions = originalSuggestions; + Suggestion[] topSuggestionSuggestions = toQuerySensitiveArray(gatherSuggestionList(dictionary, currentSuggestions[0].getSuggested(), n), n, currentSuggestions[0].getSuggested()); + // if the input query results in bad results according to some strategy, + // then attempt to navigate towards a top suggestion that does result in something good, + int noEternalLoops = 0; + while (hasBetterNestedSuggestion(topSuggestionSuggestions, currentSuggestions)) { + // no eternal loops! + if (++noEternalLoops == 100) { + System.out.println(query + " points back at it self or is too complex!"); + return originalSuggestions; + } + currentSuggestions = topSuggestionSuggestions; + topSuggestionSuggestions = toQuerySensitiveArray(gatherSuggestionList(dictionary, currentSuggestions[0].getSuggested(), n), n, currentSuggestions[0].getSuggested()); + } + return currentSuggestions; + } else { + return dictionary.getSecondLevelSuggestion(query, n); + } + + } else { + return dictionary.getSecondLevelSuggestion(query, n); + } + } + + /** + * If topSuggestionSuggestions points at something + * that has three times as many results as the top current suggestion + * then this method returns true. + * + * if the current suggestion has no hits + * and there is a suggestion with hits in topSuggestionSuggestions + * this method returns true + * + * @param topSuggestionSuggestions suggestions to first suggestion in currentSuggestions + * @param currentSuggestions suggestions to the current query + * @return true if topSuggestionSuggestions has a better suggestion than currentSuggestions + */ + public boolean hasBetterNestedSuggestion(Suggestion[] topSuggestionSuggestions, Suggestion[] currentSuggestions) { + return topSuggestionSuggestions != null + && topSuggestionSuggestions.length > 0 + && currentSuggestions[0].getCorpusQueryResults() != null + && + ((currentSuggestions[0].getCorpusQueryResults() == 0 && topSuggestionSuggestions[0].getCorpusQueryResults() > 0) + || (currentSuggestions[0].getCorpusQueryResults() * 3 < topSuggestionSuggestions[0].getCorpusQueryResults()) + ); + } + + + public EditDistance editDistanceFactory(String sd) { + return new Levenshtein(sd); + } + + + private Suggestion[] toQuerySensitiveArray(SuggestionList suggestions, int n, String query) { + // build an array that is max n in size. + + LinkedList ret = new LinkedList(); + suggestions.filter(ret, new SuggestionList.SuggestionFilter() { + + public boolean accept(Suggestion suggestion) { + return suggestion.getScore() > getSuggestionSupressionThreadshold(); + } + }); + + + if (ret.size() > 1) { + + // if the top scoring suggestion is same as the query + if (query.equals(ret.get(0).getSuggested())) { + + // switch top two suggestions + Suggestion s = ret.get(0); + ret.set(0, ret.get(1)); + ret.set(1, s); + + // if any other suggestion has many more hits + // then set that as top suggestion. + + EditDistance ed = editDistanceFactory(ret.get(0).getSuggested()); + List topHits = new LinkedList(); + if (ret.size() > 2) { + for (int i = 2; i < ret.size(); i++) { + if (ret.get(i).getCorpusQueryResults() > ret.get(0).getCorpusQueryResults() + && ed.getNormalizedDistance(ret.get(i).getSuggested()) > 0.8d /* todo setting */) { + topHits.add(ret.get(i)); + } + } + } + if (topHits.size() > 0) { + + if (topHits.size() > 1) { + Collections.sort(topHits, new Comparator() { + public int compare(Suggestion suggestion, Suggestion suggestion1) { + return new Integer(suggestion.getCorpusQueryResults()).compareTo(suggestion1.getCorpusQueryResults()); + } + }); + } + ret.remove(topHits.get(0)); + ret.addFirst(topHits.get(0)); + } + } +// else { +// // order by number of hits +// Arrays.sort(ret, new Comparator() { +// public int compare(Suggestion suggestion, Suggestion suggestion1) { +// return new Integer(suggestion1.getCorpusQueryResults().size()).compareTo(suggestion.getCorpusQueryResults().size()); +// } +// }); +// } + } + return ret.toArray(new Suggestion[n < ret.size() ? n : ret.size()]); + } + + + public double getSuggestionSupressionThreadshold + () { + return suggestionSupressionThreadshold; + } + + public void setSuggestionSupressionThreadshold + ( + double suggestionSupressionThreadshold) { + this.suggestionSupressionThreadshold = suggestionSupressionThreadshold; + } +} Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/DefaultAprioriCorpusFactory.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/DefaultAprioriCorpusFactory.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/DefaultAprioriCorpusFactory.java (revision 0) @@ -0,0 +1,65 @@ +package org.apache.lucene.search.didyoumean.impl; + +import com.sleepycat.je.DatabaseException; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.facade.IndexFacade; +import org.apache.lucene.index.facade.IndexWriterFacade; +import org.apache.lucene.search.didyoumean.AprioriCorpusFactory; +import org.apache.lucene.search.didyoumean.Suggester; +import org.apache.lucene.search.didyoumean.dictionary.Dictionary; +import org.apache.lucene.search.didyoumean.dictionary.SuggestionList; + +import java.io.IOException; +import java.util.*; +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +/** + * @author karl wettin + * Date: 2007-okt-19 + * Time: 02:13:00 + */ +public class DefaultAprioriCorpusFactory implements AprioriCorpusFactory { + + public void factory(Dictionary dictionary, Suggester suggester, IndexFacade aprioriIndex, String aprioriIndexField, Analyzer aprioriAnalyzer) throws IOException, DatabaseException { + // create an a priori index based on the inverted dictionary + + System.out.println("Inverting index..."); + Map inverted = dictionary.inverted(); + + IndexWriterFacade aprioriWriter = aprioriIndex.indexWriterFactory(aprioriAnalyzer, true); + +// int i=0; +// int i2=0; + System.out.println("Extracting most commonly misspelled words and phrases..."); + for (Map.Entry e : inverted.entrySet()) { + if (e.getValue().size() > 1) { + String suggested = suggester.didYouMean(dictionary, e.getKey()); + if (suggested != null && suggested.equalsIgnoreCase(e.getKey())) { + Document d = new Document(); + d.add(new Field(aprioriIndexField, e.getKey(), Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS)); + aprioriWriter.addDocument(d); +// i2++; +// System.out.println(i + "\t" + i2); + } + } +// i++; + } + aprioriWriter.close(); + } +} Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/SecondLevelTokenPhraseSuggester.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/SecondLevelTokenPhraseSuggester.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/SecondLevelTokenPhraseSuggester.java (revision 0) @@ -0,0 +1,49 @@ +package org.apache.lucene.search.didyoumean.secondlevel.token; + +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.index.facade.IndexFacade; +import org.apache.lucene.search.didyoumean.SecondLevelSuggester; +import org.apache.lucene.search.didyoumean.SuggestionPriorityQueue; + +import java.io.IOException; + +/** + * Makes TokenPhraseSuggesterImpl a SecondLevelSuggester. + * + * todo: this is an ugly class. decoration? + * + * @author Karl Wettin + * Date: 2007-feb-17 + * Time: 08:03:01 + */ +public class SecondLevelTokenPhraseSuggester extends TokenPhraseSuggesterImpl implements SecondLevelSuggester { + + public SecondLevelTokenPhraseSuggester(TokenSuggester tokenSuggester, String aprioriIndexField, boolean defaultSuggestMorePopularTokensOnly, int defaultMaxSuggestionsPerToken, Analyzer phraseAnalyzer, IndexFacade aprioriIndex) throws IOException { + super(tokenSuggester, aprioriIndexField, defaultSuggestMorePopularTokensOnly, defaultMaxSuggestionsPerToken, phraseAnalyzer, aprioriIndex); + } + + public SuggestionPriorityQueue suggest(String query) { + return suggest(query, 1); + } + + + public boolean hasPersistableSuggestions() { + return true; + } +} Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/MultiTokenSuggester.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/MultiTokenSuggester.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/MultiTokenSuggester.java (revision 0) @@ -0,0 +1,93 @@ +package org.apache.lucene.search.didyoumean.secondlevel.token; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.facade.IndexFacade; +import org.apache.lucene.search.*; +import org.apache.lucene.search.didyoumean.SecondLevelSuggester; +import org.apache.lucene.search.didyoumean.Suggestion; +import org.apache.lucene.search.didyoumean.SuggestionPriorityQueue; + +import java.io.IOException; +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +/** + * Uses term queries rather than span near queries. + * + * @author karl wettin + * Date: 2007-okt-23 + * Time: 04:33:29 + */ +public class MultiTokenSuggester extends TokenPhraseSuggester implements SecondLevelSuggester { + + private IndexFacade aprioriIndex; + private IndexReader aprioriReader; + private IndexSearcher aprioriSearcher; + + + protected boolean isUpdateSuggestionsOrder() { + return false; + } + + public boolean hasPersistableSuggestions() { + return false; + } + + + /** + * + * @param tokenSuggester + * @param aprioriIndexField + * @param defaultSuggestMorePopularTokensOnly it makes sense setting this to true + * @param defaultMaxSuggestionsPerToken + * @param queryAnalyzer + * @param aprioriIndex + * @throws IOException + */ + public MultiTokenSuggester(TokenSuggester tokenSuggester, String aprioriIndexField, boolean defaultSuggestMorePopularTokensOnly, int defaultMaxSuggestionsPerToken, Analyzer queryAnalyzer, IndexFacade aprioriIndex) throws IOException { + super(tokenSuggester, aprioriIndexField, defaultSuggestMorePopularTokensOnly, defaultMaxSuggestionsPerToken, queryAnalyzer); + this.aprioriIndex = aprioriIndex; + this.aprioriReader = aprioriIndex.indexReaderFactory(); + this.aprioriSearcher = new IndexSearcher(aprioriReader); + } + + public SuggestionPriorityQueue suggest(String query) { + return suggest(query, 1); + } + + protected Query suggestionAprioriQueryFactory(Suggestion[] suggestions) { + BooleanQuery bq = new BooleanQuery(); + for (Suggestion suggestion : suggestions) { + bq.add(new TermQuery(new Term(getAprioriIndexField(), suggestion.getSuggested())), BooleanClause.Occur.MUST); + } + return bq; + } + + public IndexFacade getAprioriIndex() { + return aprioriIndex; + } + + protected IndexReader getAprioriReader() { + return aprioriReader; + } + + protected IndexSearcher getAprioriSearcher() { + return aprioriSearcher; + } + +} Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/ngram/TermEnumIterator.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/ngram/TermEnumIterator.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/ngram/TermEnumIterator.java (revision 0) @@ -0,0 +1,85 @@ +package org.apache.lucene.search.didyoumean.secondlevel.token.ngram; +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermEnum; + +import java.io.IOException; +import java.util.Iterator; + +/** + * @author Nicolas Maisonneuve + * @author Karl Wettin + * Date: 2007-feb-03 + * Time: 06:40:59 + */ +public class TermEnumIterator implements Iterator { + + private String field; + + private TermEnum termEnum; + private Term actualTerm; + private boolean hasNextCalled; + + public TermEnumIterator(IndexReader reader, String field) { + this.field = field.intern(); + try { + termEnum = reader.terms(new Term(field, "")); + } catch (IOException e) { + e.printStackTrace(); + } + } + + public String next() { + if (!hasNextCalled) { + hasNext(); + } + hasNextCalled = false; + return (actualTerm != null) ? actualTerm.text() : null; + } + + public boolean hasNext() { + if (hasNextCalled) { + return actualTerm != null; + } + hasNextCalled = true; + try { + // if there are no more words + if (!termEnum.next()) { + actualTerm = null; + return false; + } + // if the next word is in the field + actualTerm = termEnum.term(); + String currentField = actualTerm.field(); + if (currentField != field) { + actualTerm = null; + return false; + } + return true; + } catch (IOException e) { + e.printStackTrace(); + return false; + } + } + + public void remove() { + throw new UnsupportedOperationException(); + } + +} Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/ngram/NgramTokenSuggester.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/ngram/NgramTokenSuggester.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/ngram/NgramTokenSuggester.java (revision 0) @@ -0,0 +1,372 @@ +package org.apache.lucene.search.didyoumean.secondlevel.token.ngram; +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +import org.apache.lucene.analysis.standard.StandardAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.document.Field; +import org.apache.lucene.index.facade.IndexFacade; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.facade.IndexWriterFacade; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.*; +import org.apache.lucene.search.didyoumean.EditDistance; +import org.apache.lucene.search.didyoumean.Levenshtein; +import org.apache.lucene.search.didyoumean.Suggestion; +import org.apache.lucene.search.didyoumean.SuggestionPriorityQueue; +import org.apache.lucene.search.didyoumean.secondlevel.token.TokenSuggester; +import org.apache.lucene.search.didyoumean.secondlevel.token.TokenSuggestion; + +import java.io.IOException; +import java.util.Collections; +import java.util.HashSet; +import java.util.Iterator; +import java.util.Set; + +/** + * A single token word suggester based on n-grams. + *

+ * Uses Lucene for persistency and token n-gram matching. + * + * @author initially inspired by the David Spencer code. + * @author Nicolas Maisonneuve + * @author Karl Wettin + * Date: 2007-feb-03 + * Time: 05:29:56 + */ +public class NgramTokenSuggester implements TokenSuggester { + + private IndexFacade ngramIndex; + private IndexReader ngramReader; + private IndexSearcher ngramSearcher; + + public NgramTokenSuggester(IndexFacade ngramIndex) throws IOException { + this.ngramIndex = ngramIndex; + ngramReader = ngramIndex.indexReaderFactory(); + ngramSearcher = new IndexSearcher(ngramReader); + } + + + public IndexReader getNgramReader() { + return ngramReader; + } + + public IndexSearcher getNgramSearcher() { + return ngramSearcher; + } + + public IndexFacade getNgramIndex() { + return ngramIndex; + } + + private int defaultHitEnumerationsPerSuggestion = 10; + + /** + * Field name for each word in the ngram index. + */ + public static final String F_WORD = "word"; + + /** + * Boost value for start and end grams + */ + private float bStart = 2.0f; + private float bEnd = 1.0f; + + // minimum score for hits generated by the spell checker query + private float minScore = 0.5f; + + /** + * Sets the accuracy 0 < minScore < 1; default 0.5 + */ + public void setAccuracy(float min) { + this.minScore = min; + } + + private boolean defaultSuggestSelf = false; + private boolean defaultSuggestOnlyMorePopularTokens = false; + + public SuggestionPriorityQueue suggest(String queryToken, int n) throws IOException { + return suggest(queryToken, n, defaultSuggestSelf, null, null, false, getDefaultHitEnumerationsPerSuggestion()); + } + + public SuggestionPriorityQueue suggest(String queryToken, int n, boolean suggestSelf, IndexReader aprioriIndexReader, String aprioriIndexField, boolean selectMorePopularTokensOnly) throws IOException { + return suggest(queryToken, n, suggestSelf, aprioriIndexReader, aprioriIndexField, selectMorePopularTokensOnly, getDefaultHitEnumerationsPerSuggestion()); + } + + /** + * Suggest similar words (restricted or not to a field of a user index) + * + * @param queryToken the word you want a spell check done on + * @param maxSuggestions the number of suggest words + * @param suggestSelf if true, a suggestion can be the queried token. + * @param aprioriIndexReader + * @param aprioriIndexField the field of the user index: if field is not null, the suggested + * words are restricted to the words present in this field. + * @param suggestOnyMorePopularTokens if true, suggest only tokens that are more frequent than the query token + * (only if restricted mode = (aprioriIndex!=null and aprioriIndexField!=null) + * @param hitEnumerationsPerSuggestion number of ngram document to measure edit distance on for each number of expected returned suggestions. @return suggestions the query token + * @throws IOException if something went wrong in either aprioriIndex or ngramIndex. + */ + public SuggestionPriorityQueue suggest(String queryToken, int maxSuggestions, boolean suggestSelf, IndexReader aprioriIndexReader, + String aprioriIndexField, boolean suggestOnyMorePopularTokens, int hitEnumerationsPerSuggestion) throws IOException { + + SuggestionPriorityQueue queue = new SuggestionPriorityQueue(maxSuggestions); + + float minScore = this.minScore; + final EditDistance editDistance = editDistanceFactory(queryToken); + final int tokenLength = queryToken.length(); + + final int goalFreq = (suggestOnyMorePopularTokens && aprioriIndexReader != null) ? aprioriIndexReader.docFreq(new Term(aprioriIndexField, queryToken)) : 0; + // if the word exists in the real index and we don't care for word frequency, return the word itself + if (!suggestOnyMorePopularTokens && goalFreq > 0) { + queue.insert(new Suggestion(queryToken)); + return queue; + } + + BooleanQuery query = new BooleanQuery(); + String[] grams; + String key; + + for (int ng = getMin(tokenLength); ng <= getMax(tokenLength); ng++) { + + key = "gram" + ng; // form key + + grams = formGrams(queryToken, ng); // form word into ngrams (allow dups too) + + if (grams.length == 0) { + continue; // hmm + } + + if (bStart > 0) { // should we boost prefixes? + add(query, "start" + ng, grams[0], bStart); // matches start of word + + } + if (bEnd > 0) { // should we boost suffixes + add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches end of word + + } + for (String gram : grams) { + add(query, key, gram); + } + } + +// System.out.println("Q: " + query); + Hits hits = ngramSearcher.search(query); +// System.out.println("HITS: " + hits.length()); + + // go thru more than 'maxr' matches in case the distance filter triggers + TokenSuggestion suggestion = new TokenSuggestion(); + int stop = maxSuggestions * hitEnumerationsPerSuggestion; + for (int currentHit = 0; currentHit < hits.length() && currentHit < stop; currentHit++) { + suggestion.setSuggested(hits.doc(currentHit).get(F_WORD)); // get orig word + + // don't suggest a word for itself, that would be silly + if (!suggestSelf && suggestion.getSuggested().equals(queryToken)) { + continue; + } + + // edit distance/normalize with the minScore word length + suggestion.setScore(1.0d - ((double) editDistance.getDistance(suggestion.getSuggested()) / Math + .min(suggestion.getSuggested().length(), tokenLength))); + if (suggestion.getScore() < minScore) { + continue; + } + + if (aprioriIndexReader != null) { // use the user index + suggestion.setFrequency(aprioriIndexReader.docFreq(new Term(aprioriIndexField, suggestion.getSuggested()))); // freq in the index + // don't suggest a word that is not present in the field + if ((suggestOnyMorePopularTokens && goalFreq > suggestion.getFrequency()) || suggestion.getFrequency() < 1) { + continue; + } + } + queue.insert(suggestion); + + + suggestion = new TokenSuggestion(); + } + + return queue; + } + + /** + * Add a clause to a boolean query. + */ + private void add(BooleanQuery q, String name, String value, float boost) { + Query tq = new TermQuery(new Term(name, value)); + tq.setBoost(boost); + q.add(new BooleanClause(tq, BooleanClause.Occur.SHOULD)); + } + + /** + * Add a clause to a boolean query. + */ + private void add(BooleanQuery q, String name, String value) { + q.add(new BooleanClause(new TermQuery(new Term(name, value)), BooleanClause.Occur.SHOULD)); + } + + /** + * Form all ngrams for a given word. + * + * @param text the word to parse + * @param ng the ngram length e.g. 3 + * @return an array of all ngrams in the word and note that duplicates are not removed + */ + private String[] formGrams(String text, int ng) { + int len = text.length(); + String[] res = new String[len - ng + 1]; + for (int i = 0; i < len - ng + 1; i++) { + res[i] = text.substring(i, i + ng); + } + return res; + } + + /** + * Index a Dictionary + * + * @param tokens the dictionary to index + * @throws IOException + */ + public void indexDictionary(Iterator tokens) throws IOException { + indexDictionary(tokens, 3); + } + + /** + * Index a Dictionary + * + * @param tokens the dictionary to index + * @param minTokenLength minimum size of token to be suggestable. 2 if you want "on" to suggest "in". + * @throws IOException + */ + public void indexDictionary(Iterator tokens, int minTokenLength) throws IOException { + if (minTokenLength < 2) { + minTokenLength = 2; + } + IndexWriterFacade writer = ngramIndex.indexWriterFactory(new StandardAnalyzer(Collections.EMPTY_SET), false); + //writer.setMergeFactor(300); + //writer.setMaxBufferedDocs(150); + + Set unflushedTokens = new HashSet(1000); + + while (tokens.hasNext()) { + String token = tokens.next(); + + int len = token.length(); + if (len < minTokenLength) { + continue; // too short we bail but "too long" is fine... + } + + if (unflushedTokens.contains(token) || ngramReader.docFreq(new Term(F_WORD, token)) > 0) { + continue; + } + + // ok index the word + Document doc = createDocument(token, getMin(len), getMax(len)); + writer.addDocument(doc); + unflushedTokens.add(token); + + } + + writer.optimize(); + writer.close(); + + IndexSearcher oldSearcher = ngramSearcher; + IndexReader oldReader = ngramReader; + + ngramReader = ngramIndex.indexReaderFactory(); + ngramSearcher = new IndexSearcher(ngramReader); + + oldSearcher.close(); + oldReader.close(); + + } + + private int getMin(int l) { + if (l > 5) { + return 3; + } + if (l == 5) { + return 2; + } + return 1; + } + + private int getMax(int l) { + if (l > 5) { + return 4; + } + if (l == 5) { + return 3; + } + return 2; + } + + private Document createDocument(String text, int ng1, int ng2) { + Document doc = new Document(); + doc.add(new Field(F_WORD, text, Field.Store.YES, Field.Index.UN_TOKENIZED)); // orig term + addGram(text, doc, ng1, ng2); + return doc; + } + + private void addGram(String text, Document doc, int ng1, int ng2) { + int len = text.length(); + for (int ng = ng1; ng <= ng2; ng++) { + String key = "gram" + ng; + String end = null; + for (int i = 0; i < len - ng + 1; i++) { + String gram = text.substring(i, i + ng); + doc.add(new Field(key, gram, Field.Store.YES, Field.Index.UN_TOKENIZED)); + if (i == 0) { + doc.add(new Field("start" + ng, gram, Field.Store.YES, Field.Index.UN_TOKENIZED)); + } + end = gram; + } + if (end != null) { // may not be present if len==ng1 + doc.add(new Field("end" + ng, end, Field.Store.YES, Field.Index.UN_TOKENIZED)); + } + } + } + + + public boolean isDefaultSuggestSelf() { + return defaultSuggestSelf; + } + + public void setDefaultSuggestSelf(boolean defaultSuggestSelf) { + this.defaultSuggestSelf = defaultSuggestSelf; + } + + public boolean isDefaultSuggestOnlyMorePopularTokens() { + return defaultSuggestOnlyMorePopularTokens; + } + + public void setDefaultSuggestOnlyMorePopularTokens(boolean defaultSuggestOnlyMorePopularTokens) { + this.defaultSuggestOnlyMorePopularTokens = defaultSuggestOnlyMorePopularTokens; + } + + + public int getDefaultHitEnumerationsPerSuggestion() { + return defaultHitEnumerationsPerSuggestion; + } + + public void setDefaultHitEnumerationsPerSuggestion(int defaultHitEnumerationsPerSuggestion) { + this.defaultHitEnumerationsPerSuggestion = defaultHitEnumerationsPerSuggestion; + } + + + public EditDistance editDistanceFactory(String sa) { + return new Levenshtein(sa); + } +} Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/TokenSuggestion.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/TokenSuggestion.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/TokenSuggestion.java (revision 0) @@ -0,0 +1,77 @@ +package org.apache.lucene.search.didyoumean.secondlevel.token; +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +import org.apache.lucene.search.didyoumean.Suggestion; + +import java.io.Serializable; + +/** + * @author Karl Wettin + * Date: 2007-feb-03 + * Time: 05:43:03 + */ +public class TokenSuggestion extends Suggestion implements Serializable { + + // private static Log log = LogFactory.getLog(SingleTokenSuggestion.class); + private static long serialVersionUID = 1l; + + private int frequency; + + + public TokenSuggestion() { + } + + public TokenSuggestion(String suggested, double score, int frequency) { + super(suggested, score); + this.frequency = frequency; + } + + + public int getFrequency() { + return frequency; + } + + public void setFrequency(int frequency) { + this.frequency = frequency; + } + + + public int compareTo(Suggestion suggestion) { + if (suggestion instanceof TokenSuggestion) { + TokenSuggestion tokenSuggestion = (TokenSuggestion) suggestion; + // first criteria: the edit distance + if (getScore() > tokenSuggestion.getScore()) { + return 1; + } + if (getScore() < tokenSuggestion.getScore()) { + return -1; + } + + // second criteria (if first criteria is equal): the popularity + if (getFrequency() > tokenSuggestion.getFrequency()) { + return 1; + } + + if (getFrequency() < tokenSuggestion.getFrequency()) { + return -1; + } + return 0; + } else { + return super.compareTo(suggestion); + } + } +} Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/TokenSuggester.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/TokenSuggester.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/TokenSuggester.java (revision 0) @@ -0,0 +1,33 @@ +package org.apache.lucene.search.didyoumean.secondlevel.token; +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.didyoumean.SuggestionPriorityQueue; + +import java.io.IOException; + + +/** + * A suggester that can only handle single token words. + * + * @author Karl Wettin + * Date: 2007-jan-30 + * Time: 06:05:54 + */ +public interface TokenSuggester { + public abstract SuggestionPriorityQueue suggest(String queryToken, int n, boolean suggestSelf, IndexReader aprioriIndexReader, String aprioriIndexField, boolean selectMorePopularTokensOnly) throws IOException; +} Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/TokenPhraseSuggester.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/TokenPhraseSuggester.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/TokenPhraseSuggester.java (revision 0) @@ -0,0 +1,448 @@ +package org.apache.lucene.search.didyoumean.secondlevel.token; +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.Token; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.TermFreqVector; +import org.apache.lucene.index.TermPositionVector; +import org.apache.lucene.index.facade.IndexFacade; +import org.apache.lucene.search.Hits; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.didyoumean.Suggestion; +import org.apache.lucene.search.didyoumean.SuggestionPriorityQueue; + +import java.io.IOException; +import java.io.StringReader; +import java.util.*; + +/** + * A layer on top of the single token suggesting {@link TokenSuggester} that enables muti token (phrase) suggestions. + *

+ * Pretty much the same thing as a SpanFuzzyQuery. + *

+ * Places a matrix of {@link org.apache.lucene.search.spans.SpanNearQuery} to find valid suggestions. If any of the + * valid hits contains a {@link org.apache.lucene.index.TermPositionVector}, it will be analyzed and suggest the query + * in the order of terms in the index. + * todo: if term positions available and stored, suggest that for cosmetic reasons in case of stemming et c.) + * todo: E.g. query "camel that broke the staw" is suggested with "straw that broke the camel" + * + * + * @author Karl Wettin + * Date: 2007-jan-30 + * Time: 05:24:38 + */ +public abstract class TokenPhraseSuggester { + + private TokenSuggester tokenSuggester; + + /** + * this is a nasty hack due to MultiTokenSuggester using this code. + * @return true if this implementation allows inspection of term vector to change order of query + */ + protected abstract boolean isUpdateSuggestionsOrder(); + + /** + * @return the index used for checking that suggestions match something, inspecting term position vector- and term frequency inspection. + * @see TokenPhraseSuggester#getAprioriIndexField() + */ + protected abstract IndexFacade getAprioriIndex(); + + protected abstract IndexReader getAprioriReader(); + + protected abstract IndexSearcher getAprioriSearcher(); + + /** + * @param suggestions suggestion to be made in to a query + * @return a query that much match in order for the suggestion to be suggested + */ + protected abstract Query suggestionAprioriQueryFactory(Suggestion[] suggestions); + + /** + * if true, the ngram suggester will only suggest words more frequent than the query word + */ + private boolean defaultSuggestMorePopularTokensOnly; + + /** + * used by the single token ngram spell checker when suggestMorePopularTokensOnly is true + * in the apriori index reader to check frequency. + */ + private String aprioriIndexField; + + + /** + * number of suggestion per token in phrase. + * there will be (n^tokens in phrase) queries placed on the index to find the best suggestion. + * e.g. "three token phrase" and n=5 might results in 243 queries on the apriori index. + */ + private int defaultMaxSuggestionsPerToken = 3; + private Analyzer queryAnalyzer; + + /** + * @param tokenSuggester the single token suggester that backs this phrase suggester + * @param aprioriIndexField the document field used for term frequency inspection at token suggestion level. + * @param defaultSuggestMorePopularTokensOnly + * if true, at token suggestion level, only suggest tokens that are more popular than the one in the original query. + * @param defaultMaxSuggestionsPerToken number of suggestion per token in phrase. there will be (n^tokens in phrase) queries placed on the index to find the best suggestion. e.g. "three token phrase" and n=5 might results in 243 queries on the apriori index. + * @param queryAnalyzer the analyzer used to tokenize phrases + * @throws java.io.IOException if something goes wrong in either the ngram spell checker or in the apriori index + */ + public TokenPhraseSuggester(TokenSuggester tokenSuggester, String aprioriIndexField, boolean defaultSuggestMorePopularTokensOnly, int defaultMaxSuggestionsPerToken, Analyzer queryAnalyzer) throws IOException { + this.tokenSuggester = tokenSuggester; + this.aprioriIndexField = aprioriIndexField; + this.defaultSuggestMorePopularTokensOnly = defaultSuggestMorePopularTokensOnly; + this.defaultMaxSuggestionsPerToken = defaultMaxSuggestionsPerToken; + this.queryAnalyzer = queryAnalyzer; + } + + public String didYouMean(String query) { + SuggestionPriorityQueue suggestions = suggest(query, 1); + if (suggestions.size() > 0) { + return ((Suggestion) suggestions.top()).getSuggested(); + } else { + return null; + } + } + + /** + * If this returns true, + * then didYouMean() will place this phrase as a query against the apriori index + * to see if it is a good suggestion or not. + * + * @param phrase the suggested phrase + * @param query the user input that requests the suggestion + * @return true if the phrase is known to the sub class + */ + protected boolean isSuggestablePhrase(String phrase, String query) { + return true; + } + + + public boolean suggestionArraysEquals(Suggestion[] suggestions, Suggestion[] suggestions1) { + if (suggestions == suggestions1) { + return true; + } + if (suggestions == null /*&& suggestions1 != null*/) { + return false; + } + if (/*suggestions != null &&*/ suggestions1 == null) { + return false; + } + if (suggestions.length != suggestions1.length) { + return false; + } + for (int i = 0; i < suggestions.length; i++) { + if (!suggestions[i].getSuggested().equals(suggestions1[i].getSuggested())) { + return false; + } + } + return true; + } + + + /** + * @param query user input + * @param maxSuggestions will not return more than this many suggestions + * @return suggestions found, in natural order + */ + public SuggestionPriorityQueue suggest(String query, int maxSuggestions) { + return suggest(query, maxSuggestions, getDefaultMaxSuggestionsPerToken(), isDefaultSuggestMorePopularTokensOnly()); + } + + /** + * @param query user input + * @param maxSuggestions limits results to this many suggestions + * @param maxSuggestionsPerToken tokens in the phrase will contain max this many suggestions + * @param suggestMorePopularTokensOnly true if the token suggester should suggest only more popular terms. not recommended. + * @return suggestions found, in natural order + */ + public SuggestionPriorityQueue suggest(String query, int maxSuggestions, int maxSuggestionsPerToken, boolean suggestMorePopularTokensOnly) { + + long ms = System.currentTimeMillis(); + + // build a matrix with all suggestions per token in query. + + final List matrix = new LinkedList(); + + TokenStream ts = getQueryAnalyzer().tokenStream(null, new StringReader(query)); + try { + Token token; + while ((token = ts.next()) != null) { + try { + SuggestionPriorityQueue suggestions = tokenSuggester.suggest(token.termText(), maxSuggestionsPerToken, true, getAprioriReader(), getAprioriIndexField(), suggestMorePopularTokensOnly); + if (suggestions.size() == 0) { + suggestions.insert(new Suggestion(token.termText())); + } + matrix.add(suggestions.toArray()); + + } catch (IOException ioe) { + throw new RuntimeException("Exception caught while looking for a suggestion to " + query, ioe); + } + } + } catch (IOException ioe) { + throw new RuntimeException("Error tokenizing " + query, ioe); + } + + /* + * iterate through all possible combinations of suggetions in the matrix + *

+    * the best game
+    * tho rest fame
+    *          lame
+    *
+    * tho best game
+    * tho best fame
+    * tho best lame
+    * tho rest game
+    * tho rest fame
+    * tho rest lame
+    * the best game
+    * the best fame
+    * the best lame
+    * the rest game
+    * the rest fame
+    * the rest lame
+    * 
+ */ + Iterator itAllCombinations = new Iterator() { + private int[] counter = new int[matrix.size()]; + + + public void remove() { + throw new IllegalStateException("not implemented"); + } + + public boolean hasNext() { + int s = counter.length; + return counter[s - 1] < matrix.get(s - 1).length; + } + + public Suggestion[] next() { + if (!hasNext()) { + throw new NoSuchElementException("no more elements"); + } + Suggestion[] ls = new Suggestion[counter.length]; + for (int i = 0; i < counter.length; i++) { + ls[i] = (matrix.get(i)[counter[i]]); + } + incrementCounter(); + return ls; + } + + private void incrementCounter() { + for (int i = 0; i < counter.length; i++) { + counter[i]++; + if (counter[i] == matrix.get(i).length && + i < counter.length - 1) { + counter[i] = 0; + } else { + break; + } + } + } + }; + + SuggestionPriorityQueue queue = new SuggestionPriorityQueue(maxSuggestionsPerToken); + + + int queryCounter = 0; + int currentHit = 0; + Hits hits = null; + Suggestion[] suggestions = null; + while (hits != null || itAllCombinations.hasNext()) { + Integer corpusQueryResults = null; + + if (hits == null) { + suggestions = itAllCombinations.next(); + queryCounter++; + + StringBuilder sb = new StringBuilder(10 * matrix.size()); + for (Suggestion suggestion : suggestions) { + sb.append(suggestion.getSuggested()); + sb.append(" "); + } + sb.deleteCharAt(sb.length() - 1); + String phrase = sb.toString(); + + //System.out.println(cnt + "\t" + phrase); + + if (isSuggestablePhrase(phrase, query)) { + Query nextQuery = suggestionAprioriQueryFactory(suggestions); + try { + + hits = getAprioriSearcher().search(nextQuery); + corpusQueryResults = hits.length(); + if (hits.length() == 0) { + hits = null; + continue; + } else { + currentHit = 0; + } + } catch (IOException ioe) { + throw new RuntimeException("Exception caught while searching for " + nextQuery.toString(), ioe); + } + } else { + continue; + } + } else { + currentHit++; + } + + + if (isUpdateSuggestionsOrder()) { + + // todo refactor, this is a nasty hack due to MultiTokenSuggester using this code. + + + if (currentHit < hits.length()) { + + // attempt to figure out the order of the tokens in the phrase + + try { + TermFreqVector termFreqVector = getAprioriReader().getTermFreqVector(hits.id(currentHit), getAprioriIndexField()); + if (termFreqVector != null && termFreqVector instanceof TermPositionVector) { + TermPositionVector termPosVector = (TermPositionVector) termFreqVector; + final int[][] suggestionPositions = new int[suggestions.length][]; + + for (int i = 0; i < suggestionPositions.length; i++) { + int termIndex = Arrays.binarySearch(termFreqVector.getTerms(), suggestions[i].getSuggested()); + suggestionPositions[i] = termPosVector.getTermPositions(termIndex); + } + + // todo: if pos vectors and stored data, then extract the suggestion as stored. +// TermVectorOffsetInfo[] offsets = termPosVector.getOffsets(termIndex); +// if (offsets != null) { +// suggestionOffsets[i][0] = i; +// suggestionOffsets[i][1] = i; +// } + + // todo: find the actual position area that matched, rather than just looking at the first occurances. + // todo: at least optimze! + Map positions = new HashMap(suggestions.length); + for (int i = 0; i < suggestions.length; i++) { + positions.put(suggestions[i], suggestionPositions[i][0]); + } + Map.Entry[] ordered = positions.entrySet().toArray((Map.Entry[]) new Map.Entry[0]); + Arrays.sort(ordered, new Comparator>() { + + public int compare(Map.Entry entry, Map.Entry entry1) { + return entry.getValue().compareTo(entry1.getValue()); + } + }); + for (int i = 0; i < ordered.length; i++) { + suggestions[i] = ordered[i].getKey(); + } + + // token order found + + } else { + // to term vector. look for one in next hit. + // todo some setting that skip this when score is lower than n or current hit greather than n. + if (hits.length() - 1 == currentHit) { + // fall back on user input + } else { + continue; + } + } + } catch (IOException ioe) { + //throw new RuntimeException("Exception caught when inspecting the term position vectors.", ioe); + } + } + } + + StringBuilder stringBuilder = new StringBuilder(matrix.size() * 10); + for (Suggestion suggestion : suggestions) { + stringBuilder.append(suggestion.getSuggested()); + stringBuilder.append(' '); + } + stringBuilder.deleteCharAt(stringBuilder.length() - 1); + + String suggested = stringBuilder.toString(); + + double score = 0; + for (Suggestion suggestion : suggestions) { + score += suggestion.getScore(); + } + score *= hits.length(); + + queue.insert(new Suggestion(suggested, score, corpusQueryResults)); + + hits = null; + + } + + System.out.println("Built matrix with " + queryCounter + " queries and executed them in " + (System.currentTimeMillis() - ms) + " milliseconds."); + + return queue; + } + + + public TokenSuggester getTokenSuggester() { + return tokenSuggester; + } + + public void setTokenSuggester(TokenSuggester tokenSuggester) { + this.tokenSuggester = tokenSuggester; + } + + /** + * @return the document field used for term frequency inspection at token suggestion level. + */ + public String getAprioriIndexField() { + return aprioriIndexField; + } + + /** + * @param aprioriIndexField the document field used for term frequency inspection at token suggestion level. + */ + public void setAprioriIndexField(String aprioriIndexField) { + this.aprioriIndexField = aprioriIndexField; + } + + /** + * // todo: consider if false will allow for "lost on translation" to suggest "lost in translation". + * // todo: will single token phrases then start suggesting a lot of bad things? + * + * @return if true, at token suggestion level, only suggest tokens that are more popular than the one in the original query. + */ + public boolean isDefaultSuggestMorePopularTokensOnly() { + return defaultSuggestMorePopularTokensOnly; + } + + public void setDefaultSuggestMorePopularTokensOnly(boolean defaultSuggestMorePopularTokensOnly) { + this.defaultSuggestMorePopularTokensOnly = defaultSuggestMorePopularTokensOnly; + } + + + public int getDefaultMaxSuggestionsPerToken() { + return defaultMaxSuggestionsPerToken; + } + + public void setDefaultMaxSuggestionsPerToken(int defaultMaxSuggestionsPerToken) { + this.defaultMaxSuggestionsPerToken = defaultMaxSuggestionsPerToken; + } + + public Analyzer getQueryAnalyzer() { + return queryAnalyzer; + } + + public void setQueryAnalyzer(Analyzer queryAnalyzer) { + this.queryAnalyzer = queryAnalyzer; + } +} Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/TokenPhraseSuggesterImpl.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/TokenPhraseSuggesterImpl.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/TokenPhraseSuggesterImpl.java (revision 0) @@ -0,0 +1,71 @@ +package org.apache.lucene.search.didyoumean.secondlevel.token; +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.index.facade.IndexFacade; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.didyoumean.Suggestion; +import org.apache.lucene.search.spans.SpanNearQuery; +import org.apache.lucene.search.spans.SpanTermQuery; + +import java.io.IOException; + +/** + * @author Karl Wettin + * Date: 2007-feb-03 + * Time: 08:11:34 + */ +public class TokenPhraseSuggesterImpl extends TokenPhraseSuggester { + + private IndexFacade aprioriIndex; + private IndexReader aprioriReader; + private IndexSearcher aprioriSearcher; + + + protected boolean isUpdateSuggestionsOrder() { + return true; + } + + public TokenPhraseSuggesterImpl(TokenSuggester tokenSuggester, String aprioriIndexField, boolean defaultSuggestMorePopularTokensOnly, int defaultMaxSuggestionsPerToken, Analyzer phraseAnalyzer, IndexFacade aprioriIndex) throws IOException { + super(tokenSuggester, aprioriIndexField, defaultSuggestMorePopularTokensOnly, defaultMaxSuggestionsPerToken, phraseAnalyzer); + this.aprioriIndex = aprioriIndex; + this.aprioriReader = aprioriIndex.indexReaderFactory(); + this.aprioriSearcher = new IndexSearcher(aprioriReader); + } + + protected Query suggestionAprioriQueryFactory(Suggestion[] suggestions) { + SpanTermQuery[] clauses = new SpanTermQuery[suggestions.length]; + for (int i = 0; i < suggestions.length; i++) { + clauses[i] = new SpanTermQuery(new Term(getAprioriIndexField(), suggestions[i].getSuggested())); + } + return new SpanNearQuery(clauses, 5, false); + } + + public IndexFacade getAprioriIndex() { + return aprioriIndex; + } + + protected IndexReader getAprioriReader() { + return aprioriReader; + } + + protected IndexSearcher getAprioriSearcher() { + return aprioriSearcher; + } +} Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/package.html =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/package.html (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/package.html (revision 0) @@ -0,0 +1,13 @@ + + + + + + + + +This package needs refactoring. MultiTokenSuggester and TokenPhraseSuggester should extend the same abstract class. + + + + \ No newline at end of file Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/QueryGoalTreeExtractor.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/QueryGoalTreeExtractor.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/QueryGoalTreeExtractor.java (revision 0) @@ -0,0 +1,40 @@ +package org.apache.lucene.search.didyoumean; + +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +import java.util.List; + +/** + * A user session could contain multiple quests for content. For example, + * first the user looks for the Apache licence, + * spells it wrong, inspects different results, + * and then the user search for the author Ivan Goncharov. + *

+ * In the session analyzing suggester, this is called different goals. + *

+ * It is up to the QueryGoalTreeExtractor implementations to decide what + * events in a session are parts of the same goal, as we don't want to + * suggest the user to check out Goncharov when looking for the Apache license. + * + * @author Karl Wettin + * Date: Aug 1, 2006 + * Time: 2:33:47 PM + */ +public interface QueryGoalTreeExtractor { + + public abstract List> extractGoalRoots(QueryGoalNode session); +} Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/Levenshtein.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/Levenshtein.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/Levenshtein.java (revision 0) @@ -0,0 +1,125 @@ +package org.apache.lucene.search.didyoumean; + +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +/** + * Levenshtein edit distance. + * http://en.wikipedia.org/wiki/Levenshtein_distance + *

+ * todo: this is currenly just a copy of the TRStringDistance, + * todo: the thought is to make this in to an interface that TRStringDistance implements + * todo: but that requires a bit of refactoring. + */ +public final class Levenshtein extends EditDistance { + + private final int n; + private final int[][][] cache = new int[30][][]; + + + /** + * Optimized to run a bit faster than the static getDistance(). + * In one benchmark times were 5.3sec using ctr vs 8.5sec w/ static method, thus 37% faster. + */ + public Levenshtein(String sa) { + super(sa); + n = sa.length(); + } + + + public final int getDistance(String query) { + int d[][]; // matrix + int cost; // cost + + // Step 1 + final char[] ta = query.toCharArray(); + final int m = ta.length; + if (n == 0) { + return m; + } + if (m == 0) { + return n; + } + + if (m >= cache.length) { + d = form(n, m); + } else if (cache[m] != null) { + d = cache[m]; + } else { + d = cache[m] = form(n, m); + + // Step 3 + + } + for (int i = 1; i <= n; i++) { + final char s_i = sa[i - 1]; + + // Step 4 + + for (int j = 1; j <= m; j++) { + final char t_j = ta[j - 1]; + + // Step 5 + + if (s_i == t_j) { // same + cost = 0; + } else { // not a match + cost = 1; + + // Step 6 + + } + d[i][j] = min3(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost); + } + } + + // Step 7 + return d[n][m]; + + } + + + private static int[][] form(int n, int m) { + int[][] d = new int[n + 1][m + 1]; + // Step 2 + + for (int i = 0; i <= n; i++) { + d[i][0] = i; + + } + for (int j = 0; j <= m; j++) { + d[0][j] = j; + } + return d; + } + + + //**************************** + // Get minimum of three values + //**************************** + private static int min3(int a, int b, int c) { + int mi = a; + if (b < mi) { + mi = b; + } + if (c < mi) { + mi = c; + } + return mi; + + } + +} Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/dictionary/SuggestionList.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/dictionary/SuggestionList.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/dictionary/SuggestionList.java (revision 0) @@ -0,0 +1,131 @@ +package org.apache.lucene.search.didyoumean.dictionary; + +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +import com.sleepycat.persist.model.Entity; +import com.sleepycat.persist.model.PrimaryKey; + +import java.io.Serializable; +import java.util.Collections; +import java.util.Iterator; +import java.util.LinkedList; +import java.util.List; + +import org.apache.lucene.search.didyoumean.Suggestion; + +/** + * A list of suggetions to a miss spelled word, ordered by suggestion score. + * + * @author Karl Wettin + * Date: 2007-feb-02 + * Time: 06:28:23 + *

+ */ +@Entity +public class SuggestionList implements Iterable, Serializable { + + // private static Log log = LogFactory.getLog(SuggestionsList.class); + private static long serialVersionUID = 1l; + + @PrimaryKey + private String query; + private List suggestions = new LinkedList(); + + /** bdb persistency */ + private SuggestionList(){} + + SuggestionList(String query) { + this.query = query; + } + + public Suggestion get(int index) { + return suggestions.get(index); + } + + public Suggestion get(String suggested) { + for (Suggestion suggestion : suggestions) { + if (suggestion.getSuggested().equals(suggested)) { + return suggestion; + } + } + return null; + } + + public Iterator iterator() { + return suggestions.iterator(); + } + + public int size() { + return suggestions.size(); + } + + public boolean containsSuggested(String suggested) { + for (Suggestion suggestion : suggestions) { + if (suggested.equals(suggestion.getSuggested())) { + return true; + } + } + return false; + } + + public void addSuggested(String suggested, double score, Integer corpusQueryResults) { + if (containsSuggested(suggested)) { + throw new IllegalArgumentException("Already contains suggested '" + suggested + "'"); + } + Suggestion suggestion = new Suggestion(suggested, score, corpusQueryResults); + int index = Collections.binarySearch(suggestions, suggestion); + if (index < 0) { + index = (index * -1) - 1; + } + suggestions.add(index, suggestion); + } + + public static interface SuggestionFilter { + public abstract boolean accept(Suggestion suggestion); + } + + public Suggestion[] toArray(Suggestion[] arr) { + return suggestions.toArray(arr); + } + + public List filter(SuggestionFilter filter) { + List list = new LinkedList(); + filter(list, filter); + return list; + } + + public void filter(List list, SuggestionFilter filter) { + for (Suggestion s : this.suggestions) { + if (filter.accept(s)) { + list.add(s); + } + } + } + + public void sort() { + Collections.sort(suggestions); + } + + public String getQuery() { + return query; + } + + + List getSuggestions() { + return suggestions; + } +} Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/dictionary/Dictionary.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/dictionary/Dictionary.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/dictionary/Dictionary.java (revision 0) @@ -0,0 +1,235 @@ +package org.apache.lucene.search.didyoumean.dictionary; + +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +import com.sleepycat.je.DatabaseException; +import com.sleepycat.je.Environment; +import com.sleepycat.je.EnvironmentConfig; +import com.sleepycat.persist.EntityCursor; +import com.sleepycat.persist.EntityStore; +import com.sleepycat.persist.PrimaryIndex; +import com.sleepycat.persist.StoreConfig; +import org.apache.lucene.search.didyoumean.SecondLevelSuggester; +import org.apache.lucene.search.didyoumean.Suggester; +import org.apache.lucene.search.didyoumean.Suggestion; +import org.apache.lucene.search.didyoumean.SuggestionPriorityQueue; + +import java.io.File; +import java.util.Arrays; +import java.util.Comparator; +import java.util.HashMap; +import java.util.Map; + +/** + * Essentially a dictionary with a bunch of weighted suggestions, created by a {@link org.apache.lucene.search.didyoumean.Trainer} + * and navigated by a {@link org.apache.lucene.search.didyoumean.Suggester}. + *

+ * If the dictionary does not contain a suggestion for a given query, it will be passed on to any available instance + * of {@link org.apache.lucene.search.didyoumean.SecondLevelSuggester} that hopefully will comes up with a suggestion. + * + * @author Karl Wettin + * Date: Jul 31, 2006 + * Time: 2:10:19 PM + */ +public class Dictionary { + + private EntityStore store; + private PrimaryIndex suggestionsByQuery; + + + public Dictionary(File bdbPath) throws DatabaseException { + if (!bdbPath.exists()) { + System.out.println("Creating path " + bdbPath); + bdbPath.mkdirs(); + } + + EnvironmentConfig envConfig = new EnvironmentConfig(); + envConfig.setAllowCreate(true); + envConfig.setTransactional(false); + envConfig.setLocking(false); + Environment env = new Environment(bdbPath, envConfig); + + StoreConfig storeConfig = new StoreConfig(); + storeConfig.setAllowCreate(true); + storeConfig.setTransactional(false); + store = new EntityStore(env, "didYouMean", storeConfig); + + suggestionsByQuery = store.getPrimaryIndex(String.class, SuggestionList.class); + } + + public SuggestionList suggestionListFactory(String query) { + return new SuggestionList(keyFormatter(query)); + } + + public void close() throws DatabaseException { + store.close(); + } + + /** + * This function allows modification of the dictionary keys. Default implementation stripps them from all + * whitespace and punctuation, enabling features such as "allwork and no fun" suggesting "all work and no fun" + * That will of course require that the trainer have inserted the key to suggest to it's original self. Training + * everything the users type in might consume a lot of resources. + * {@link org.apache.lucene.search.didyoumean.impl.DefaultTrainer#setTrainingFinalGoalAsSelf(boolean)} ()} + * + * @param inputKey dictionary key to be formatted + * @return inputKey stripped from all whitespace and punctuation. + */ + public String keyFormatter(String inputKey) { + return inputKey.replaceAll("\\p{Punct}", "").replaceAll("\\s", "").toLowerCase(); + } + + /** + * Implementation must pass query through keyformatter! + * @param query unformatted key + * @return suggestion list associated with key + */ + public SuggestionList getSuggestions(String query) throws DatabaseException { + return suggestionsByQuery.get(keyFormatter(query)); + } + + /** + * This is used by the second level cache so it only comes up with suggestions that don't exist in the dictionary. + * + * @param query the original query from the user + * @param suggested the suggestion to the query to inspect if suggestable + * @return true if the suggested is a known suggetion to the query. + */ + public boolean isExistingSuggestion(String query, String suggested) throws DatabaseException { + SuggestionList suggestions = getSuggestions(query); + return suggestions != null && suggestions.containsSuggested(suggested); + } + + + private Map prioritesBySecondLevelSuggester = new HashMap(); + + /** + * Comes up with the best suggestion from the second level suggesters. + * {@link org.apache.lucene.search.didyoumean.SecondLevelSuggester} + *

+ * This metod also adds the suggestion to the dictionary! + * + * @param query the user input + * @param n number of suggestions requested + * @return the best suggestion the second level suggesters could come up with + */ + public Suggestion[] getSecondLevelSuggestion(String query, int n) throws DatabaseException { + Map suggestionsBySuggested = new HashMap(); + for (Map.Entry suggester_boost : prioritesBySecondLevelSuggester.entrySet()) { + SuggestionPriorityQueue suggestions = suggester_boost.getKey().suggest(query); + for (int i = 0; i < n && suggestions.size() > i; i++) { + Suggestion suggestion = (Suggestion) suggestions.pop(); + if (suggester_boost.getKey().hasPersistableSuggestions()) { + Suggestion internedSuggestion = suggestionsBySuggested.get(suggestion.getSuggested()); + if (internedSuggestion == null) { + internedSuggestion = new Suggestion(suggestion.getSuggested(), 0d); + } + internedSuggestion.setScore(internedSuggestion.getScore() + suggester_boost.getValue()); + suggestionsBySuggested.put(suggestion.getSuggested(), internedSuggestion); + } + } + } + + if (suggestionsBySuggested.size() == 0) { + return null; + } + + Suggestion[] suggestions = suggestionsBySuggested.values().toArray(new Suggestion[suggestionsBySuggested.size()]); + Arrays.sort(suggestions, new Comparator() { + public int compare(Suggestion suggestion, Suggestion suggestion1) { + return Double.compare(suggestion.getScore(), suggestion1.getScore()); + } + }); + + // add to dictionary + SuggestionList suggestionList = suggestionListFactory(query); + suggestionList.addSuggested(suggestions[0].getSuggested(), 1d, suggestions[0].getCorpusQueryResults()); + suggestionsByQuery.put(suggestionList); + return suggestions; + } + + + public Map getPrioritesBySecondLevelSuggester() { + return prioritesBySecondLevelSuggester; + } + + public void setPrioritesBySecondLevelSuggester(Map prioritesBySecondLevelSuggester) { + this.prioritesBySecondLevelSuggester = prioritesBySecondLevelSuggester; + } + + /** + * Used to extract bootstrapped a priori corpus from the dictionary + * @return a map where suggestion is key and the value is a list of misspelled words that suggests the key. + * @throws DatabaseException + * @see org.apache.lucene.search.didyoumean.SuggestionFacade#secondLevelSuggestionFactory(org.apache.lucene.index.facade.IndexFacade,org.apache.lucene.index.facade.IndexFacade) + */ + public Map inverted() throws DatabaseException { + + // todo use a temporary bdb for this so we dont run out of memory + + Map inverted = new HashMap(); + + for (Map.Entry e : getSuggestionsByQuery().map().entrySet()) { + Suggestion s = e.getValue().get(0); + SuggestionList sl = inverted.get(s.getSuggested()); + if (sl == null) { + sl = new SuggestionList(s.getSuggested()); + inverted.put(s.getSuggested(), sl); + } + sl.addSuggested(e.getKey(), s.getScore(), s.getCorpusQueryResults()); + } + + + return inverted; + } + + + public PrimaryIndex getSuggestionsByQuery() { + return suggestionsByQuery; + } + + + /** + * Scans the dictionary for queries that suggests a query + * that in their own turn suggest something else. + * These bad suggestions will be replaced by the final suggestion, + * so that the suggester don't have to spend clock ticks doing this in real time. + * + * @param suggester + */ + public void optimize(Suggester suggester) throws DatabaseException { + // todo + } + + /** + * Removes excess suggestions that probably never be suggested, + * for instance those with too great distance from top suggestion. + */ + public void prune(int maxSize) throws DatabaseException { + EntityCursor cursor = getSuggestionsByQuery().entities(); + SuggestionList suggestionList; + while ((suggestionList = cursor.next()) != null) { + if (suggestionList.size() > maxSize) { + for (int i = maxSize; i < suggestionList.size(); i++) { + suggestionList.getSuggestions().remove(i); + } + getSuggestionsByQuery().put(suggestionList); + } + } + } + +} Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/doc-files/goalTreeA.png =================================================================== Cannot display: file marked as a binary type. svn:mime-type = application/octet-stream Property changes on: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/doc-files/goalTreeA.png ___________________________________________________________________ Name: svn:mime-type + application/octet-stream Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/doc-files/goalTreeB.png =================================================================== Cannot display: file marked as a binary type. svn:mime-type = application/octet-stream Property changes on: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/doc-files/goalTreeB.png ___________________________________________________________________ Name: svn:mime-type + application/octet-stream Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/doc-files/goalTreeA.dot =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/doc-files/goalTreeA.dot (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/doc-files/goalTreeA.dot (revision 0) @@ -0,0 +1,20 @@ +digraph goalTreeA +{ + nodeA[shape=box label="heroes of knight and magic"]; + nodeB[shape=box label="heroes of light and magic"]; + nodeC[shape=box label="heroes of night and magic"]; + nodeD[shape=box label="heroes of might and magic"]; + nodeE[shape=box label="homm"]; + nodeA->nodeB + nodeA->nodeC + nodeC->nodeD + nodeD->nodeE + inspectionD1[label="inspected 1st result"]; + inspectionD2[label="commented 3rd result"]; + inspectionD1->nodeD; + inspectionD2->nodeD; + inspectionE1[label="inspected 1st result"]; + inspectionE2[label="inspected 2nd result"]; + inspectionE1->nodeE; + inspectionE2->nodeE; +} \ No newline at end of file Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/doc-files/goalTreeB.dot =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/doc-files/goalTreeB.dot (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/doc-files/goalTreeB.dot (revision 0) @@ -0,0 +1,33 @@ +digraph goalTreeA +{ + nodeA[shape=box label="heroes of knight and magic" color="red"]; + nodeB[shape=box label="heroes of light and magic" color="red"]; + nodeC[shape=box label="heroes of night and magic" color="red"]; + nodeD[shape=box label="heroes of might and magic" color="green"]; + nodeE[shape=box label="homm" color="green"]; + nodeA->nodeB[label="edit distance"]; + nodeA->nodeC[label="edit distance"]; + nodeC->nodeD[label="edit distance"]; + nodeD->nodeE[label="short time"]; + + inspectionD1[label="inspected 1st result"]; + inspectionD2[label="commented 3rd result"]; + inspectionD1->nodeD + inspectionD2->nodeD; + inspectionE1[label="inspected 1st result"]; + inspectionE2[label="inspected 2nd result"]; + inspectionE1->nodeE; + inspectionE2->nodeE; + + suggestionA[label="heroes of might and magic" color="yellow"]; + nodeA->suggestionA[label="adapt++"]; + suggestionB[label="heroes of might and magic" color="yellow"]; + nodeB->suggestionB[label="adapt++"]; + suggestionC[label="heroes of might and magic" color="yellow"]; + nodeC->suggestionC[label="adapt++"]; + suggestionD[label="homm" color="yellow"]; + nodeD->suggestionD[label="adapt++"]; + suggestionE[label="heroes of might and magic" color="yellow"]; + nodeE->suggestionE[label="adapt++"]; + +} \ No newline at end of file Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/doc-files/classdiagram.uxf =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/doc-files/classdiagram.uxf (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/doc-files/classdiagram.uxf (revision 0) @@ -0,0 +1,489 @@ + + + + com.umlet.element.base.Class + + 780 + 290 + 200 + 90 + + QueryGoalNode<R> + -- + +query:String + +timeStamp:long + +suggestions:Suggestion[] + -- + + + + + + com.umlet.element.base.Class + + 760 + 0 + 230 + 160 + + QueryGoalNode<R>.Inspection + -- + +reference:R + +timeStamp:long + _+NOT_GOAL:Double=-1d_ + _+NEUTRAL:Double=0d_ + _+GOAL:Double=1d_ + _+MOO:Double=null _ + +classification:Double + -- + + + + + com.umlet.element.base.Relation + + 813 + 140 + 134 + 170 + + lt=<<<<-> + m2=0..* + <<ordered>> + 67;150;67;20 + + + com.umlet.element.base.Relation + + 860 + 310 + 277 + 180 + + lt=<<<- + m2=0..* + r2=child + m1=1 + r1=parent + <<ordered>> + 120;20;210;20;210;160;20;160;20;70 + + + com.umlet.element.base.Relation + + 650 + 360 + 170 + 120 + + lt=<. + <<process>> + 150;20;20;100 + + + com.umlet.element.base.Class + + 140 + 790 + 160 + 30 + + DefaultSuggester<R> + + + + com.umlet.element.base.Relation + + 540 + 280 + 40 + 170 + + lt=<<. + 20;150;20;20 + + + com.umlet.element.base.Class + + 400 + 270 + 280 + 30 + + DefaultQueryGoalTreeExtractor<R> + + + + + com.umlet.element.base.Note + + 160 + 600 + 220 + 30 + + The consumer level interface + bg=blue + + + + + com.umlet.element.base.Relation + + 540 + 460 + 40 + 160 + + lt=-> + r2=1 + + 20;140;20;20 + + + com.umlet.element.base.Relation + + 540 + 610 + 40 + 180 + + lt=-> + r2=1 + + 20;20;20;160 + + + com.umlet.element.base.Class + + 440 + 430 + 230 + 50 + + <<interface>> + QueryGoalTreeExtractor<R> + + + + + com.umlet.element.base.Class + + 480 + 600 + 150 + 30 + + SuggestionsFacade + + + + + com.umlet.element.base.Relation + + 610 + 590 + 190 + 40 + + lt=-> + r2=1 + + 20;20;170;20 + + + com.umlet.element.base.Class + + 420 + 770 + 280 + 80 + + <<interface>> + Suggester<R> + -- + +didYouMean(String, int):Suggestion[] + +didYouMean(String):String + + + + + com.umlet.element.base.Relation + + 980 + 600 + 180 + 40 + + lt=<<. + 20;20;160;20 + + + com.umlet.element.base.Relation + + 832 + 640 + 136 + 170 + + lt=<. + <<updates>> + 68;150;68;20 + + + com.umlet.element.base.Class + + 780 + 570 + 220 + 90 + + <<interface>> + Trainer<R> + -- + +queue(QueryGoalNode<R>) + +flush(Dictionary); + + + + + com.umlet.element.base.Class + + 1140 + 610 + 150 + 30 + + DefaultTrainer<R> + + + + com.umlet.element.base.Class + + 1170 + 770 + 170 + 70 + + Suggestion + -- + +suggested:String + +score:double + -- + + + + + com.umlet.element.base.Relation + + 920 + 776 + 270 + 54 + + lt=<<<<-> + q1=key + m2=0..* + <<ordered>> + 20;34;250;34 + + + com.umlet.element.base.Note + + 350 + 1270 + 280 + 30 + + Basically the old contrib/spell checker + bg=blue + + + + + com.umlet.element.base.Relation + + 610 + 1270 + 210 + 40 + + lt=. + 190;20;20;20 + + + com.umlet.element.base.Class + + 800 + 1270 + 190 + 30 + + NgramTokenSuggester + + + + com.umlet.element.base.Class + + 840 + 1170 + 120 + 30 + + TokenSuggester + + + + com.umlet.element.base.Class + + 810 + 1070 + 180 + 30 + + TokenPhraseSuggester + + + + com.umlet.element.base.Relation + + 880 + 980 + 40 + 110 + + lt=<<. + 20;20;20;90 + + + com.umlet.element.base.Relation + + 880 + 1080 + 40 + 110 + + lt=-> + m2=1 + + 20;20;20;90 + + + com.umlet.element.base.Relation + + 360 + 590 + 140 + 40 + + lt=. + + 120;20;20;20 + + + com.umlet.element.base.Relation + + 880 + 1180 + 40 + 110 + + lt=<<- + 20;20;20;90 + + + com.umlet.element.base.Class + + 760 + 930 + 280 + 70 + + <<interface>> + SecondLevelSuggester + -- + +didYouMean(String, int):Suggestion[] + + + + + com.umlet.element.base.Relation + + 880 + 800 + 40 + 150 + + lt=<<<<-> + m2=0..* + + 20;20;20;130 + + + com.umlet.element.base.Relation + + 680 + 776 + 190 + 54 + + lt=<. + <<navigates>> + 170;34;20;34 + + + com.umlet.element.base.Relation + + 280 + 780 + 160 + 40 + + lt=<<. + 140;20;20;20 + + + com.umlet.element.base.Relation + + 380 + 820 + 50 + 40 + + lt=<<. + -80;-10;-80;-10 + + + com.umlet.element.base.Relation + + 880 + 860 + 110 + 40 + + lt=. + 20;20;90;20 + + + com.umlet.element.base.Class + + 970 + 860 + 140 + 30 + + _weight:double_ + + + + com.umlet.element.base.Class + + 850 + 790 + 90 + 30 + + Dictionary + + + \ No newline at end of file Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/Suggestion.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/Suggestion.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/Suggestion.java (revision 0) @@ -0,0 +1,106 @@ +package org.apache.lucene.search.didyoumean; + +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +import com.sleepycat.persist.model.Entity; +import com.sleepycat.persist.model.Persistent; + +import java.io.Serializable; +import java.text.DecimalFormat; + +/** + * A scored string value, a suggestion in a dictionary. + * + * @author Karl Wettin + * Date: Jul 31, 2006 + * Time: 4:59:06 PM + *

+ */ +@Persistent +public class Suggestion implements Comparable { + + private String suggested; + private double score = 1d; + private Integer corpusQueryResults; + + + public Suggestion() { + } + + public Suggestion(String suggested) { + this.suggested = suggested; + } + + + public Suggestion(String suggested, double score) { + this.suggested = suggested; + this.score = score; + } + + + public Suggestion(String suggested, double score, Integer corpusQueryResults) { + this.suggested = suggested; + this.score = score; + this.corpusQueryResults = corpusQueryResults; + } + + /** + * + * @param suggestion suggestion this instance to be compared to + * @return the suggestion scores compared. + */ + public int compareTo(Suggestion suggestion) { + return Double.compare(suggestion.getScore(), getScore()); + } + + + public Integer getCorpusQueryResults() { + return corpusQueryResults; + } + + public void setCorpusQueryResults(Integer corpusQueryResults) { + this.corpusQueryResults = corpusQueryResults; + } + + public String getSuggested() { + return suggested; + } + + public void setSuggested(String suggested) { + this.suggested = suggested; + } + + public double getScore() { + return score; + } + + public void setScore(double score) { + this.score = score; + } + + private static final DecimalFormat df = new DecimalFormat("#.##"); + + public String toString() { + StringBuilder sb = new StringBuilder(); + sb.append(df.format(getScore())); + sb.append('\t'); + sb.append(getSuggested()); + return sb.toString(); + } + + +} Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/AprioriCorpusFactory.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/AprioriCorpusFactory.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/AprioriCorpusFactory.java (revision 0) @@ -0,0 +1,47 @@ +package org.apache.lucene.search.didyoumean; + +import com.sleepycat.je.DatabaseException; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.index.facade.IndexFacade; +import org.apache.lucene.search.didyoumean.dictionary.Dictionary; + +import java.io.IOException; +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +/** + * Extracts an a priori corpus from a dictionary. + * + * @author karl wettin + * Date: 2007-okt-19 + * Time: 02:07:34 + */ +public interface AprioriCorpusFactory { + + /** + * Initializes and populates an a priori index with documents whos text is known to be correct. + * + * @param dictionary dictionary to extract data from. + * @param suggester suggester used to navigate the dictionary. + * @param aprioriIndex lucene index, will be created/reinitialized. + * @param aprioriIndexField index field used to store the a priori text. + * @param aprioriAnalyzer analyzer used to tokenize a priori text. + * @throws IOException + * @throws DatabaseException + */ + public abstract void factory(Dictionary dictionary, Suggester suggester, IndexFacade aprioriIndex, String aprioriIndexField, Analyzer aprioriAnalyzer) throws IOException, DatabaseException; + +} Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/SecondLevelSuggester.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/SecondLevelSuggester.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/SecondLevelSuggester.java (revision 0) @@ -0,0 +1,35 @@ +package org.apache.lucene.search.didyoumean; + +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +/** + * When the dictionary is not aware of any suggestion the second level + * suggesters are called upon to find the best suggestions and insert + * them to the dictionary. + * + * @author Karl Wettin + * Date: 2007-jan-30 + * Time: 06:05:54 + */ +public interface SecondLevelSuggester { + public abstract SuggestionPriorityQueue suggest(String query); + + /** + * @return true if suggestions from this suggester should be persisted in the dictionary + */ + public abstract boolean hasPersistableSuggestions(); +} Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/AbstractSuggester.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/AbstractSuggester.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/AbstractSuggester.java (revision 0) @@ -0,0 +1,40 @@ +package org.apache.lucene.search.didyoumean; + +import com.sleepycat.je.DatabaseException; +import org.apache.lucene.search.didyoumean.dictionary.Dictionary; +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +/** + * Abstract suggester. + * + * @author Karl Wettin + * Date: 2007-feb-17 + * Time: 02:10:42 + */ +public abstract class AbstractSuggester implements Suggester { + + public String didYouMean(Dictionary dictionary, String query) throws DatabaseException { + Suggestion[] suggestions = didYouMean(dictionary, query, 1); + if (suggestions == null || suggestions.length == 0) { + return null; + } else { + return suggestions[0].getSuggested(); + } + } + + +} Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/SuggestionPriorityQueue.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/SuggestionPriorityQueue.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/SuggestionPriorityQueue.java (revision 0) @@ -0,0 +1,50 @@ +package org.apache.lucene.search.didyoumean; + +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +import org.apache.lucene.util.PriorityQueue; + +import java.util.LinkedList; + +/** + * User: kalle + * Date: 2007-mar-03 + * Time: 13:14:38 + */ +public class SuggestionPriorityQueue extends PriorityQueue { + + public SuggestionPriorityQueue(int maxSize) { + initialize(maxSize); + } + + + protected boolean lessThan(Object a, Object b) { + Suggestion s0 = (Suggestion) a; + Suggestion s1 = (Suggestion) b; + return s0.compareTo(s1) == -1; + } + + public Suggestion[] toArray() { + LinkedList list = new LinkedList(); + Suggestion s; + while ((s = (Suggestion) pop()) != null) { + list.add(s); + } + return list.toArray(new Suggestion[list.size()]); + } + +} Index: contrib/didyoumean/src/java/org/apache/lucene/index/facade/DirectoryIndexFacade.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/index/facade/DirectoryIndexFacade.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/index/facade/DirectoryIndexFacade.java (revision 0) @@ -0,0 +1,44 @@ +package org.apache.lucene.index.facade; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.store.Directory; +import org.apache.lucene.index.IndexReader; + +import java.io.IOException; +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +/** + * @author karl wettin + * Date: 2007-aug-17 + * Time: 00:17:14 + */ +public class DirectoryIndexFacade extends IndexFacade { + + private Directory directory; + + public DirectoryIndexFacade(Directory directory) throws IOException { + this.directory = directory; + } + + public IndexReader indexReaderFactory() throws IOException { + return IndexReader.open(directory); + } + + public IndexWriterFacade indexWriterFactory(Analyzer analyzer, boolean create) throws IOException { + return new DirectoryIndexWriterFacade(directory, analyzer, create); + } +} Index: contrib/didyoumean/src/java/org/apache/lucene/index/facade/IndexFacadeFactory.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/index/facade/IndexFacadeFactory.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/index/facade/IndexFacadeFactory.java (revision 0) @@ -0,0 +1,29 @@ +package org.apache.lucene.index.facade; + +import java.io.IOException; +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +/** + * @author karl wettin + * Date: 2007-okt-23 + * Time: 20:37:44 + */ +public interface IndexFacadeFactory { + + public abstract IndexFacade factory() throws IOException; + +} Index: contrib/didyoumean/src/java/org/apache/lucene/index/facade/InstantiatedIndexFacade.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/index/facade/InstantiatedIndexFacade.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/index/facade/InstantiatedIndexFacade.java (revision 0) @@ -0,0 +1,66 @@ +package org.apache.lucene.index.facade; + +import org.apache.lucene.store.instantiated.InstantiatedIndex; +import org.apache.lucene.store.instantiated.InstantiatedIndexWriter; +import org.apache.lucene.document.Document; +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.index.IndexReader; + +import java.io.IOException; +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +/** + * @author karl wettin + * Date: 2007-aug-17 + * Time: 03:51:57 + */ +public class InstantiatedIndexFacade + extends IndexFacade { + + private InstantiatedIndex ii; + + public InstantiatedIndexFacade(InstantiatedIndex ii) { + this.ii = ii; + } + + public IndexReader indexReaderFactory() throws IOException { + return ii.indexReaderFactory(); + } + + public IndexWriterFacade indexWriterFactory(final Analyzer analyzer, final boolean create) throws IOException { + return new IndexWriterFacade() { + + private InstantiatedIndexWriter iw = ii.indexWriterFactory(analyzer, create); + + public void addDocument(Document document, Analyzer analyzer) throws IOException { + iw.addDocument(document, analyzer); + } + + public void addDocument(Document document) throws IOException { + iw.addDocument(document); + } + + public void close() throws IOException { + iw.close(); + } + + public void optimize() throws IOException { + + } + }; + } +} Index: contrib/didyoumean/src/java/org/apache/lucene/index/facade/IndexWriterFacade.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/index/facade/IndexWriterFacade.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/index/facade/IndexWriterFacade.java (revision 0) @@ -0,0 +1,38 @@ +package org.apache.lucene.index.facade; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.document.Document; + +import java.io.IOException; +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +/** + * @author karl wettin + * Date: 2007-aug-16 + * Time: 23:23:39 + */ +public abstract class IndexWriterFacade { + + public abstract void addDocument(Document document, Analyzer analyzer) throws IOException; + + public abstract void addDocument(Document document) throws IOException; + + public abstract void close() throws IOException; + + public abstract void optimize() throws IOException; + +} Index: contrib/didyoumean/src/java/org/apache/lucene/index/facade/IndexFacade.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/index/facade/IndexFacade.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/index/facade/IndexFacade.java (revision 0) @@ -0,0 +1,34 @@ +package org.apache.lucene.index.facade; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.index.IndexReader; + +import java.io.IOException; +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +/** + * @author karl wettin + * Date: 2007-aug-16 + * Time: 23:23:22 + */ +public abstract class IndexFacade { + + public abstract IndexReader indexReaderFactory() throws IOException; + + public abstract IndexWriterFacade indexWriterFactory(Analyzer analyzer, boolean create) throws IOException; + +} Index: contrib/didyoumean/src/java/org/apache/lucene/index/facade/DirectoryIndexWriterFacade.java =================================================================== --- contrib/didyoumean/src/java/org/apache/lucene/index/facade/DirectoryIndexWriterFacade.java (revision 0) +++ contrib/didyoumean/src/java/org/apache/lucene/index/facade/DirectoryIndexWriterFacade.java (revision 0) @@ -0,0 +1,53 @@ +package org.apache.lucene.index.facade; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.store.Directory; +import org.apache.lucene.index.IndexWriter; + +import java.io.IOException; +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ + + +/** + * @author karl wettin + * Date: 2007-aug-17 + * Time: 00:18:25 + */ +public class DirectoryIndexWriterFacade extends IndexWriterFacade { + + private IndexWriter indexWriter; + + public DirectoryIndexWriterFacade(Directory directory, Analyzer analyzer, boolean create) throws IOException { + indexWriter = new IndexWriter(directory, analyzer, create); + } + + public void addDocument(Document document, Analyzer analyzer) throws IOException { + indexWriter.addDocument(document, analyzer); + } + + public void addDocument(Document document) throws IOException { + indexWriter.addDocument(document); + } + + public void close() throws IOException { + indexWriter.close(); + } + + public void optimize() throws IOException { + indexWriter.optimize(); + } +} Index: contrib/didyoumean/build.xml =================================================================== --- contrib/didyoumean/build.xml (revision 0) +++ contrib/didyoumean/build.xml (revision 0) @@ -0,0 +1,42 @@ + + + + + + + + Extended spell checker with phrase support and adaptive user session analysis. + + + + + + + + + + + + + + + + + + +