Index: contrib/didyoumean/pom.xml.template
===================================================================
--- contrib/didyoumean/pom.xml.template (revision 0)
+++ contrib/didyoumean/pom.xml.template (revision 0)
@@ -0,0 +1,61 @@
+
+
+
+
+ 4.0.0
+
+ org.apache.lucene
+ lucene-contrib
+ @version@
+
+ org.apache.lucene
+ lucene-didyoumean
+ Lucene query suggester
+ @version@
+
+ Extended spell checker with phrase support and adaptive user session analysis.
+
+ jar
+
+
+
+ berkeleydb
+ je
+ 3.2.44
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+
+ 1.5
+ 1.5
+
+
+
+
+
+
+
Index: contrib/didyoumean/lib/je-3.2.44.jar
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: contrib/didyoumean/lib/je-3.2.44.jar
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Index: contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/impl/TestDefaultGoalTreeExtractor.java
===================================================================
--- contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/impl/TestDefaultGoalTreeExtractor.java (revision 0)
+++ contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/impl/TestDefaultGoalTreeExtractor.java (revision 0)
@@ -0,0 +1,33 @@
+package org.apache.lucene.search.didyoumean.impl;
+
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+
+import org.apache.lucene.search.didyoumean.impl.DefaultQueryGoalTreeExtractor;
+
+/**
+ * @author Karl Wettin
+ * Date: 2007-feb-02
+ * Time: 01:03:25
+ */
+public class TestDefaultGoalTreeExtractor extends TestQueryGoalExtractor {
+
+
+ protected org.apache.lucene.search.didyoumean.QueryGoalTreeExtractor goalTreeExtractorFactory() {
+ return new DefaultQueryGoalTreeExtractor();
+ }
+}
Index: contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/impl/TestGoalJuror.java
===================================================================
--- contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/impl/TestGoalJuror.java (revision 0)
+++ contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/impl/TestGoalJuror.java (revision 0)
@@ -0,0 +1,272 @@
+package org.apache.lucene.search.didyoumean.impl;
+
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+import junit.framework.TestCase;
+import org.apache.lucene.search.didyoumean.QuerySession;
+import org.apache.lucene.search.didyoumean.SecondLevelSuggester;
+import org.apache.lucene.search.didyoumean.Suggestion;
+import org.apache.lucene.search.didyoumean.SuggestionFacade;
+
+import java.io.*;
+import java.net.URL;
+import java.util.Map;
+import java.util.zip.GZIPInputStream;
+
+/**
+ * @author Karl Wettin
+ * Date: 2007-feb-24
+ * Time: 00:13:05
+ */
+public class TestGoalJuror extends TestCase {
+
+ // private static Log log = LogFactory.getLog(TestGoalJuror.class);
+ // private static long serialVersionUID = 1l;
+
+ private File dataRootPath = new File("data");
+
+ @Override
+ protected void setUp() throws Exception {
+ if (!dataRootPath.exists()) {
+ dataRootPath.mkdirs();
+ }
+ }
+
+ /** left out here for closure reasons */
+ private SuggestionFacade facade;
+
+
+ public void testConcept() throws Exception {
+
+ facade = new SuggestionFacade(new File(dataRootPath, "TestGoalJuror.testConcept/" + String.valueOf(System.currentTimeMillis())));
+
+ QuerySession session = facade.getQuerySessionManager().querySessionFactory();
+ session.query("heroes of night and magic", 0, null, 1l);
+ session.query("heroes of knight and magic", 0, null, 2l);
+ session.query("heroes of might and magic", 10, null, 3l);
+
+ assertTrue(session.isExpired());
+ facade.getQuerySessionManager().getSessionsByID().put(session);
+
+ facade.trainExpiredQuerySessions();
+
+ assertEquals("heroes of might and magic", facade.didYouMean("heroes of night and magic"));
+ assertEquals("heroes of might and magic", facade.didYouMean("heroes of knight and magic"));
+ assertEquals("heroes of might and magic", facade.didYouMean("heroes of might and magic"));
+
+ facade.close();
+
+ }
+
+
+ public void testImportData() throws Exception {
+
+ // load 3,500,000 user queries with session id, time stamp and number of hits.
+ // no goals specified. let the goal juror figure that out.
+ // all queries are lower cased.
+
+ File facadePath = new File(dataRootPath, "backup");
+ // File facadePath = new File(dataRootPath, "TestGoalJuror.testImportData/" + String.valueOf(System.currentTimeMillis());
+
+ facade = new SuggestionFacade(facadePath);
+
+
+ if (facade.getDictionary().getSuggestionsByQuery().map().size() == 0) {
+
+ File testDataFile = new File(dataRootPath, "queries_grouped.txt");
+ if (!testDataFile.exists()) {
+ System.out.println("Downloading test data...");
+ InputStream input = new GZIPInputStream(new URL("http://ginandtonique.org/~kalle/LUCENE-626/queries_grouped.txt.gz").openStream());
+ OutputStream output = new FileOutputStream(testDataFile);
+ byte[] buf = new byte[102400];
+ int len;
+ while ((len = input.read(buf)) > -1) {
+ output.write(buf, 0, len);
+ }
+ input.close();
+ output.close();
+ }
+
+ System.out.println("Importing test data...");
+
+
+ int cnt = 0;
+ int total = 0;
+
+ BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(testDataFile), "UTF8"));
+ String line;
+
+ assertEquals(5, br.readLine().split("\t").length);
+
+ String sessionID = "";
+ QuerySession querySession = null;
+
+ while ((line = br.readLine()) != null) {
+ String[] values = line.split("\t");
+ if (!values[0].equals(sessionID)) {
+ if (querySession != null) {
+ facade.getQuerySessionManager().getSessionsByID().put(querySession);
+ if (cnt == 10000) {
+
+ // throws a nasty exception!
+ // http://forums.oracle.com/forums/thread.jspa?threadID=577588&tstart=0
+
+// // train while still creating...
+// if (total == 200000) {
+// new Thread(new Runnable() {
+// public void run() {
+// try {
+// facade.trainExpiredQuerySessions(2);
+// } catch (Exception e) {
+// e.printStackTrace();
+// // todo
+// }
+// }
+// }).start();
+// }
+
+ System.out.println(total + " sessions loaded.");
+ cnt = 0;
+ }
+ }
+ querySession = new QuerySession();
+ querySession.setId(values[0]);
+ sessionID = values[0];
+ cnt++;
+ total++;
+ }
+ String query = values[4].trim().toLowerCase();
+ int hits = Integer.valueOf(values[3]);
+ long timestamp = Long.valueOf(values[1]) * 1000;
+ querySession.query(query, hits, null, timestamp);
+ }
+ facade.getQuerySessionManager().getSessionsByID().put(querySession);
+ br.close();
+
+ facade.trainExpiredQuerySessions(4);
+
+ System.out.println("Reopening persistent dictionary.");
+ facade.close();
+ facade = new SuggestionFacade(facadePath);
+
+ }
+
+
+ System.out.println("Running assertation tests...");
+ // run some tests without the second level suggestions,
+ // i.e. user behavioral data only. no ngrams or so.
+
+ assertEquals("pirates of the caribbean", facade.didYouMean("pirates ofthe caribbean"));
+
+ assertEquals("pirates of the caribbean", facade.didYouMean("pirates of the carribbean"));
+
+ assertEquals("pirates caribbean", facade.didYouMean("pirates carricean"));
+ assertEquals("pirates of the caribbean", facade.didYouMean("pirates of the carriben"));
+ assertEquals("pirates of the caribbean", facade.didYouMean("pirates of the carabien"));
+ assertEquals("pirates of the caribbean", facade.didYouMean("pirates of the carabbean"));
+
+ assertEquals("pirates of the caribbean", facade.didYouMean("pirates og carribean"));
+
+ assertEquals("pirates of the caribbean soundtrack", facade.didYouMean("pirates of the caribbean music"));
+ assertEquals("pirates of the caribbean score", facade.didYouMean("pirates of the caribbean soundtrack"));
+
+ assertEquals("pirate of caribbean", facade.didYouMean("pirate of carabian"));
+ assertEquals("pirates of caribbean", facade.didYouMean("pirate of caribbean"));
+ assertEquals("pirates of caribbean", facade.didYouMean("pirates of caribbean"));
+
+ // depening on how many hits and goals are noted with these two queries
+ // perhaps the delta should be added to a synonym dictionary?
+ assertEquals("homm iv", facade.didYouMean("homm 4"));
+
+
+
+
+
+ // add token phrase suggester: requires a lot of memory!!
+
+ // not yet known.. and we have no second level yet.
+ assertNull(facade.didYouMean("the pilates"));
+
+ System.out.println("Creating second level suggesters from comon typos...");
+ // use the dictionary built from user queries to build the token phrase and ngram suggester.
+
+ facade.getDictionary().getPrioritesBySecondLevelSuggester().putAll(facade.secondLevelSuggestionFactory());
+ System.out.println("Done creating second level suggesters from comon typos...");
+
+ assertEquals("the pirates", facade.didYouMean("the pilates"));
+ // now it's learned, should take no time.
+ assertEquals("the pirates", facade.didYouMean("the pilates"));
+
+
+
+
+ // typos
+ assertEquals("heroes of might and magic", facade.didYouMean("heroes of fight and magic"));
+ assertEquals("heroes of might and magic", facade.didYouMean("heroes of right and magic"));
+ assertEquals("heroes of might and magic", facade.didYouMean("heroes of magic and light"));
+
+ // composite dictionary key not learned yet..
+ assertEquals(null, facade.didYouMean("heroesof lightand magik"));
+ // learn
+ assertEquals("heroes of might and magic", facade.didYouMean("heroes of light and magik"));
+ // test
+ assertEquals("heroes of might and magic", facade.didYouMean("heroesof lightand magik"));
+
+ // wrong term order
+ assertEquals("heroes of might and magic", facade.didYouMean("heroes of magic and might"));
+
+
+ System.out.println("Reopening persistent dictionary.");
+ facade.close();
+ facade = new SuggestionFacade(facadePath);
+ System.out.println("Persistent dictionary reopened.");
+
+ // same tests again, to make sure persistency works.
+
+ // should be known by now
+ assertEquals("the pirates", facade.didYouMean("the pilates"));
+
+ // run some tests without the second level suggestions,
+ // i.e. user behavioral data only. no ngrams or so.
+
+ assertEquals("pirates of the caribbean", facade.didYouMean("pirates ofthe caribbean"));
+
+ assertEquals("pirates of the caribbean", facade.didYouMean("pirates of the carribbean"));
+
+ assertEquals("pirates caribbean", facade.didYouMean("pirates carricean"));
+ assertEquals("pirates of the caribbean", facade.didYouMean("pirates of the carriben"));
+ assertEquals("pirates of the caribbean", facade.didYouMean("pirates of the carabien"));
+ assertEquals("pirates of the caribbean", facade.didYouMean("pirates of the carabbean"));
+
+ assertEquals("pirates of the caribbean", facade.didYouMean("pirates og carribean"));
+
+ assertEquals("pirates of the caribbean soundtrack", facade.didYouMean("pirates of the caribbean music"));
+ assertEquals("pirates of the caribbean score", facade.didYouMean("pirates of the caribbean soundtrack"));
+
+ assertEquals("pirate of caribbean", facade.didYouMean("pirate of carabian"));
+ assertEquals("pirates of caribbean", facade.didYouMean("pirate of caribbean"));
+ assertEquals("pirates of caribbean", facade.didYouMean("pirates of caribbean"));
+
+ // depening on how many hits and goals are noted with these two queries
+ // perhaps the delta should be added to a synonym dictionary?
+ assertEquals("homm iv", facade.didYouMean("homm 4"));
+
+
+ }
+
+
+}
Index: contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/impl/TestQueryGoalExtractor.java
===================================================================
--- contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/impl/TestQueryGoalExtractor.java (revision 0)
+++ contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/impl/TestQueryGoalExtractor.java (revision 0)
@@ -0,0 +1,103 @@
+package org.apache.lucene.search.didyoumean.impl;
+
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+
+import junit.framework.TestCase;
+
+import java.util.List;
+
+import org.apache.lucene.search.didyoumean.QueryGoalNode;
+
+/**
+ * @author Karl Wettin
+ * Date: Aug 4, 2006
+ * Time: 6:41:38 AM
+ */
+public abstract class TestQueryGoalExtractor extends TestCase {
+
+ private org.apache.lucene.search.didyoumean.QueryGoalTreeExtractor goalTreeExtractor;
+
+ @Override
+ protected void setUp() throws Exception {
+ goalTreeExtractor = goalTreeExtractorFactory();
+ }
+
+ protected abstract org.apache.lucene.search.didyoumean.QueryGoalTreeExtractor goalTreeExtractorFactory();
+
+ public void testSingleNode() throws Exception {
+ assertEquals(1, goalTreeExtractor.extractGoalRoots(new QueryGoalNode(null, "heroes of knight and magic", 10, null, 1l)).size());
+
+ }
+
+ public void testTime() throws Exception {
+
+ QueryGoalNode node;
+
+ QueryGoalNode session = new QueryGoalNode(null, "heroes of knight and magic", 10, null, 1l);
+ node = new QueryGoalNode(session, "heroes of knight and magic", 10, 0l);
+ node = new QueryGoalNode(node, "heroes of night and magic", 13, 10000l);
+ node = new QueryGoalNode(node, "heroes of might and magic", 132, 15000l);
+ node = new QueryGoalNode(node, "heroes of might and magic 3", 12, 25000l);
+ node.new Inspection(null, QueryGoalNode.MOO, 25000l);
+ node = new QueryGoalNode(node, "heroes of might and magic iv", 12, 100000l);
+ node = new QueryGoalNode(node, "heroes of might and magic 4", 6, 105000l);
+ node.new Inspection(null, QueryGoalNode.GOAL, 105000l);
+
+ assertEquals(1, goalTreeExtractor.extractGoalRoots(session).size());
+
+ }
+
+ public void testSimilarity() throws Exception {
+
+ QueryGoalNode node;
+ QueryGoalNode session = new QueryGoalNode(null, "heroes of knight and magic", 10, null, 0l);
+ node = new QueryGoalNode(session, "heroes of night and magic", 13, 10000l);
+ node = new QueryGoalNode(node, "heroes of might and magic", 132, 15000l);
+ node = new QueryGoalNode(node, "heroes of might and magic 3", 12, 25000l);
+
+ node = new QueryGoalNode(node, "the davinci code", 12, 27000l);
+ new QueryGoalNode(node, "the da vinci code", 6, 29000l);
+
+ List> goalTreeRoots = goalTreeExtractor.extractGoalRoots(session);
+
+ assertEquals(2, goalTreeRoots.size());
+ assertEquals(1, goalTreeRoots.get(0).numChildrenRecursive());
+ assertEquals(3, goalTreeRoots.get(1).numChildrenRecursive());
+
+ }
+
+ public void testHits() throws Exception {
+ QueryGoalNode node;
+ QueryGoalNode session = new QueryGoalNode(null, "heroes of knight and magic", 10, null, 0l);
+ node = new QueryGoalNode(session, "heroes of night and magic", 13, 10000l);
+ node = new QueryGoalNode(node, "heroes of might and magic", 132, 15000l);
+ node = new QueryGoalNode(node, "heroes of might and magic 3", 0, 25000l);
+
+ node = new QueryGoalNode(node, "the davinci code", 12, 27000l);
+ new QueryGoalNode(node, "the da vinci code", 6, 29000l);
+
+ List> goalTreeRoots = goalTreeExtractor.extractGoalRoots(session);
+
+ assertEquals(2, goalTreeRoots.size());
+ assertEquals(1, goalTreeRoots.get(0).numChildrenRecursive());
+ assertEquals(3, goalTreeRoots.get(1).numChildrenRecursive());
+
+
+ }
+
+}
Index: contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/impl/TestDefaultImplementation.java
===================================================================
--- contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/impl/TestDefaultImplementation.java (revision 0)
+++ contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/impl/TestDefaultImplementation.java (revision 0)
@@ -0,0 +1,235 @@
+package org.apache.lucene.search.didyoumean.impl;
+
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+
+import junit.framework.TestCase;
+import org.apache.lucene.search.didyoumean.QueryGoalNode;
+import org.apache.lucene.search.didyoumean.Suggestion;
+import org.apache.lucene.search.didyoumean.SuggestionFacade;
+import org.apache.lucene.search.didyoumean.secondlevel.token.SecondLevelTokenPhraseSuggester;
+import org.apache.lucene.search.didyoumean.secondlevel.token.ngram.NgramTokenSuggester;
+import org.apache.lucene.search.didyoumean.secondlevel.token.ngram.TermEnumIterator;
+import org.apache.lucene.index.facade.IndexFacade;
+import org.apache.lucene.index.facade.DirectoryIndexFacade;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.analysis.WhitespaceAnalyzer;
+import org.apache.lucene.store.RAMDirectory;
+
+import java.io.File;
+
+/**
+ * @author Karl Wettin
+ * Date: Jul 31, 2006
+ * Time: 12:26:07 PM
+ */
+public class TestDefaultImplementation extends TestCase {
+
+ private SuggestionFacade suggestionFacade;
+
+ @Override
+ protected void setUp() throws Exception {
+
+ suggestionFacade = new SuggestionFacade(new File("data/TestDefaultImplementation/" + String.valueOf(System.currentTimeMillis())));
+
+ // your primary index that suggestions must match.
+ IndexFacade aprioriIndex = new DirectoryIndexFacade(new RAMDirectory());
+ aprioriIndex.indexWriterFactory(null, true).close();
+ String aprioriField = "title";
+
+ // build the ngram suggester
+ IndexFacade ngramIndex = new DirectoryIndexFacade(new RAMDirectory());
+ ngramIndex.indexWriterFactory(null, true).close();
+ NgramTokenSuggester ngramSuggester = new NgramTokenSuggester(ngramIndex);
+
+ IndexReader aprioriReader = aprioriIndex.indexReaderFactory();
+ ngramSuggester.indexDictionary(new TermEnumIterator(aprioriReader, aprioriField));
+
+ // the greater the better results but with a longer response time.
+ int maxSuggestionsPerToken = 3;
+
+ // add ngram suggester wrapped in a single token phrase suggester as second level suggester.
+ suggestionFacade.getDictionary().getPrioritesBySecondLevelSuggester().put(new SecondLevelTokenPhraseSuggester(ngramSuggester, aprioriField, false, maxSuggestionsPerToken, new WhitespaceAnalyzer(), aprioriIndex), 1d);
+ }
+
+ public void testBasicTraining() throws Exception {
+ QueryGoalNode node;
+
+ node = new QueryGoalNode(null, "heroes of nmight and magic", 3);
+ node = new QueryGoalNode(node, "heroes of night and magic", 3);
+ node = new QueryGoalNode(node, "heroes of might and magic", 10);
+ node.new Inspection(23, QueryGoalNode.GOAL);
+ suggestionFacade.trainSessionQueryTree(node);
+
+ node = new QueryGoalNode(null, "heroes of night and magic", 3);
+ node = new QueryGoalNode(node, "heroes of knight and magic", 7);
+ node = new QueryGoalNode(node, "heroes of might and magic", 20);
+ node.new Inspection(23, QueryGoalNode.GOAL);
+ suggestionFacade.trainSessionQueryTree(node);
+
+ node = new QueryGoalNode(null, "heroes of might and magic", 20, 1l);
+ suggestionFacade.trainSessionQueryTree(node);
+
+ node = new QueryGoalNode(null, "heroes of night and magic", 7, 0l);
+ node = new QueryGoalNode(node, "heroes of light and magic", 14, 1l);
+ node = new QueryGoalNode(node, "heroes of might and magic", 2, 6l);
+ node.new Inspection(23, QueryGoalNode.GOAL);
+ node.new Inspection(23, QueryGoalNode.GOAL);
+ suggestionFacade.trainSessionQueryTree(node);
+
+ node = new QueryGoalNode(null, "heroes of night and magic", 4, 0l);
+ node = new QueryGoalNode(node, "heroes of knight and magic", 17, 1l);
+ node = new QueryGoalNode(node, "heroes of might and magic", 2, 2l);
+ node.new Inspection(23, QueryGoalNode.GOAL);
+ suggestionFacade.trainSessionQueryTree(node);
+
+ assertEquals("heroes of might and magic", suggestionFacade.didYouMean("heroes of light and magic"));
+ assertEquals("heroes of might and magic", suggestionFacade.didYouMean("heroes of night and magic"));
+
+ assertEquals("heroes of might and magic", suggestionFacade.didYouMean("heroes ofnight andmagic"));
+
+ // todo fight has not been trained. use trie .
+ //assertEquals("heroes of might and magic", spellChecker.didYouMean(didYouMean.getDictionary(), "heroes of fight and magic"));
+
+
+ }
+
+
+ public void testSynonymTraining() throws Exception {
+ QueryGoalNode node;
+
+
+ node = new QueryGoalNode(null, "heroes of night and magic", 12, 0l);
+ node = new QueryGoalNode(node, "heroes of might and magic", 30, 1l);
+ node.new Inspection(23, QueryGoalNode.GOAL);
+ suggestionFacade.trainSessionQueryTree(node);
+
+ node = new QueryGoalNode(null, "heroes of night and magic", 13, 0l);
+ node = new QueryGoalNode(node, "heroes of knight and magic", 14, 1l);
+ node = new QueryGoalNode(node, "heroes of might and magic", 22, 2l);
+ node.new Inspection(23, QueryGoalNode.GOAL);
+ node = new QueryGoalNode(node, "homm", 17, 4l);
+ node.new Inspection(23, QueryGoalNode.GOAL);
+ suggestionFacade.trainSessionQueryTree(node);
+
+
+ assertEquals("heroes of might and magic", suggestionFacade.didYouMean("homm"));
+
+
+ node = new QueryGoalNode(null, "homm", 17, 0l);
+ node.new Inspection(23, QueryGoalNode.GOAL);
+ suggestionFacade.trainSessionQueryTree(node);
+
+
+ node = new QueryGoalNode(null, "heroes of night and magic", 12, 0l);
+ node = new QueryGoalNode(node, "heroes of light and magic", 4, 1l);
+ node = new QueryGoalNode(node, "heroes of might and magic", 17, 2l);
+ node.new Inspection(23, QueryGoalNode.GOAL);
+ node = new QueryGoalNode(node, "homm", 34, 4l);
+ node.new Inspection(23, QueryGoalNode.GOAL);
+ node.new Inspection(24, QueryGoalNode.GOAL);
+ suggestionFacade.trainSessionQueryTree(node);
+
+
+ node = new QueryGoalNode(null, "heroes of night and magic", 3, 0l);
+ node = new QueryGoalNode(node, "heroes of knight and magic", 3, 1l);
+ node = new QueryGoalNode(node, "heroes of might and magic", 22, 2l);
+ node.new Inspection(23, QueryGoalNode.GOAL);
+ suggestionFacade.trainSessionQueryTree(node);
+
+ assertEquals("homm", suggestionFacade.didYouMean("heroes of might and magic"));
+ assertEquals("heroes of might and magic", suggestionFacade.didYouMean("heroes of night and magic"));
+
+ // todo fight has not been trained. navigate the shortest way down the trie to find suggestion?
+ //assertEquals("heroes of might and magic", spellChecker.didYouMean(didYouMean.getDictionary(), "heroes of fight and magic"));
+
+ assertEquals("homm", suggestionFacade.didYouMean("heroes of might and magic"));
+ assertEquals("heroes of might and magic", suggestionFacade.didYouMean("homm"));
+
+ }
+
+ public void testPrefixedSynonymTraining() throws Exception {
+ QueryGoalNode node;
+
+
+ node = new QueryGoalNode(null, "heroes of might and magic 2", 22, 0l);
+ node = new QueryGoalNode(node, "heroes of might and magic ii", 13, 1l);
+ node.new Inspection(23, QueryGoalNode.GOAL);
+ suggestionFacade.trainSessionQueryTree(node);
+
+
+
+ node = new QueryGoalNode(null, "heroes of might and magic 2", 22, 0l);
+ node = new QueryGoalNode(node, "heroes of might and magic ii", 13, suggestionFacade.didYouMean("heroes of might and magic 2", 1)[0].getSuggested(), 1l);
+ node.new Inspection(23, QueryGoalNode.GOAL);
+ suggestionFacade.trainSessionQueryTree(node);
+
+
+
+ node = new QueryGoalNode(null, "heroes of might and magic ii", 22, 0l);
+ node = new QueryGoalNode(node, "heroes of might and magic 2", 13, suggestionFacade.didYouMean("heroes of might and magic ii", 1)[0].getSuggested(), 1l);
+ node.new Inspection(23, QueryGoalNode.GOAL);
+ suggestionFacade.trainSessionQueryTree(node);
+
+ node = new QueryGoalNode(null, "heroes of might and magic ii", 22, 0l);
+ node.new Inspection(23, QueryGoalNode.GOAL);
+ node = new QueryGoalNode(node, "heroes of might and magic 2", 13, suggestionFacade.didYouMean("heroes of might and magic ii", 1)[0].getSuggested(), 2l);
+ node.new Inspection(23, QueryGoalNode.GOAL);
+ suggestionFacade.trainSessionQueryTree(node);
+
+ assertEquals("heroes of might and magic 2", suggestionFacade.didYouMean("heroes of might and magic ii"));
+ assertEquals("heroes of might and magic ii", suggestionFacade.didYouMean("heroes of might and magic 2"));
+
+ }
+
+ public void testNegativeAdaptation() throws Exception {
+
+ testSynonymTraining();
+
+
+ Suggestion aprioriSuggestion = suggestionFacade.getDictionary().getSuggestions("heroesofmightandmagic").get(0);
+ assertEquals("homm", aprioriSuggestion.getSuggested());
+
+ for (int i = 0; i < 10; i++) {
+ QueryGoalNode node;
+ node = new QueryGoalNode(null, "heroes of might and magic", 22, 0l);
+ node.new Inspection(23, QueryGoalNode.GOAL);
+ suggestionFacade.trainSessionQueryTree(node);
+ }
+
+ Suggestion resultSuggestion = suggestionFacade.getDictionary().getSuggestions("heroesofmightandmagic").get(0);
+
+ assertFalse("homm".equals(resultSuggestion.getSuggested()));
+
+ }
+
+
+ public void testWhiteSpace() throws Exception {
+
+ QueryGoalNode node;
+
+ node = new QueryGoalNode(null, "the davinci code", 22, 0l);
+ node = new QueryGoalNode(node, "the da vinci code", 13, 1l);
+ node.new Inspection(23, QueryGoalNode.GOAL);
+ suggestionFacade.trainSessionQueryTree(node);
+
+ assertEquals("the da vinci code", suggestionFacade.didYouMean("thedavincicode"));
+ assertEquals("the da vinci code", suggestionFacade.didYouMean("the dav-inci code"));
+
+ }
+
+}
Index: contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/secondlevel/token/ngram/TestNgramTokenSuggester.java
===================================================================
--- contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/secondlevel/token/ngram/TestNgramTokenSuggester.java (revision 0)
+++ contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/secondlevel/token/ngram/TestNgramTokenSuggester.java (revision 0)
@@ -0,0 +1,190 @@
+package org.apache.lucene.search.didyoumean.secondlevel.token.ngram;
+
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+import junit.framework.TestCase;
+import org.apache.lucene.analysis.SimpleAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.*;
+import org.apache.lucene.index.facade.IndexFacade;
+import org.apache.lucene.index.facade.IndexWriterFacade;
+import org.apache.lucene.index.facade.DirectoryIndexFacade;
+import org.apache.lucene.store.RAMDirectory;
+import org.apache.lucene.search.didyoumean.Suggestion;
+
+import java.io.IOException;
+
+/**
+ * @author Karl Wettin
+ * Date: 2007-feb-03
+ * Time: 06:27:55
+ */
+public class TestNgramTokenSuggester extends TestCase {
+
+ // private static Log log = LogFactory.getLog(TestNgramTokenSuggester.class);
+ // private static long serialVersionUID = 1l;
+
+ private NgramTokenSuggester ngramTokenSuggester;
+ private IndexFacade aprioriIndex;
+
+ protected void setUp() throws Exception {
+ super.setUp();
+
+ //create a user index
+ aprioriIndex = new DirectoryIndexFacade(new RAMDirectory());
+ IndexWriterFacade writer = aprioriIndex.indexWriterFactory(new SimpleAnalyzer(), true);
+
+ for (int i = 0; i < 1000; i++) {
+ Document doc = new Document();
+ doc.add(new Field("field1", intToEnglish(i), Field.Store.YES, Field.Index.TOKENIZED));
+ doc.add(new Field("field2", intToEnglish(i + 1), Field.Store.YES, Field.Index.TOKENIZED));
+ writer.addDocument(doc);
+ }
+
+
+ writer.close();
+
+ // create the spellChecker
+ IndexFacade ngramIndex = new DirectoryIndexFacade(new RAMDirectory());
+ ngramIndex.indexWriterFactory(null, true).close();
+ ngramTokenSuggester = new NgramTokenSuggester(ngramIndex);
+ }
+
+
+ public void testBuild() {
+ try {
+
+ IndexReader reader = aprioriIndex.indexReaderFactory();
+
+ addwords(reader, "field1");
+ int num_field1 = ngramTokenSuggester.getNgramReader().numDocs();
+
+ addwords(reader, "field2");
+ int num_field2 = ngramTokenSuggester.getNgramReader().numDocs();
+
+
+ assertEquals(num_field2, num_field1 + 1);
+
+ // test small word
+
+ assertEquals("five", ((Suggestion)ngramTokenSuggester.suggest("fvie", 2).top()).getSuggested());
+ assertEquals("nine", ((Suggestion)ngramTokenSuggester.suggest("five", 2).top()).getSuggested());
+ assertEquals("five", ((Suggestion)ngramTokenSuggester.suggest("fiv", 2).top()).getSuggested());
+ assertEquals("five", ((Suggestion)ngramTokenSuggester.suggest("ive", 20).top()).getSuggested());
+ assertEquals("five", ((Suggestion)ngramTokenSuggester.suggest("fives", 20).top()).getSuggested());
+ assertEquals("five", ((Suggestion)ngramTokenSuggester.suggest("fie", 20).top()).getSuggested());
+ assertEquals(0, ngramTokenSuggester.suggest("fi", 20).size());
+
+ // test restraint to a field
+ assertEquals(0, ngramTokenSuggester.suggest("tousand", 100, false, reader, "field1", false).size());
+ assertEquals(1, ngramTokenSuggester.suggest("tousand", 100, false, reader, "field2", false).size());
+
+ } catch (IOException e) {
+ e.printStackTrace();
+ fail();
+ }
+ }
+
+
+ private void addwords(IndexReader r, String field) throws IOException {
+ long time = System.currentTimeMillis();
+ ngramTokenSuggester.indexDictionary(new TermEnumIterator(r, field));
+ time = System.currentTimeMillis() - time;
+ //System.out.println("time to build " + field + ": " + time);
+ }
+
+
+ public static String intToEnglish(int i) {
+ StringBuffer result = new StringBuffer();
+ intToEnglish(i, result);
+ return result.toString();
+ }
+
+ public static void intToEnglish(int i, StringBuffer result) {
+ if (i == 0) {
+ result.append("zero");
+ return;
+ }
+ if (i < 0) {
+ result.append("minus ");
+ i = -i;
+ }
+ if (i >= 1000000000) { // billions
+ intToEnglish(i/1000000000, result);
+ result.append("billion, ");
+ i = i%1000000000;
+ }
+ if (i >= 1000000) { // millions
+ intToEnglish(i/1000000, result);
+ result.append("million, ");
+ i = i%1000000;
+ }
+ if (i >= 1000) { // thousands
+ intToEnglish(i/1000, result);
+ result.append("thousand, ");
+ i = i%1000;
+ }
+ if (i >= 100) { // hundreds
+ intToEnglish(i/100, result);
+ result.append("hundred ");
+ i = i%100;
+ }
+ if (i >= 20) {
+ switch (i/10) {
+ case 9 : result.append("ninety"); break;
+ case 8 : result.append("eighty"); break;
+ case 7 : result.append("seventy"); break;
+ case 6 : result.append("sixty"); break;
+ case 5 : result.append("fifty"); break;
+ case 4 : result.append("forty"); break;
+ case 3 : result.append("thirty"); break;
+ case 2 : result.append("twenty"); break;
+ }
+ i = i%10;
+ if (i == 0)
+ result.append(" ");
+ else
+ result.append("-");
+ }
+ switch (i) {
+ case 19 : result.append("nineteen "); break;
+ case 18 : result.append("eighteen "); break;
+ case 17 : result.append("seventeen "); break;
+ case 16 : result.append("sixteen "); break;
+ case 15 : result.append("fifteen "); break;
+ case 14 : result.append("fourteen "); break;
+ case 13 : result.append("thirteen "); break;
+ case 12 : result.append("twelve "); break;
+ case 11 : result.append("eleven "); break;
+ case 10 : result.append("ten "); break;
+ case 9 : result.append("nine "); break;
+ case 8 : result.append("eight "); break;
+ case 7 : result.append("seven "); break;
+ case 6 : result.append("six "); break;
+ case 5 : result.append("five "); break;
+ case 4 : result.append("four "); break;
+ case 3 : result.append("three "); break;
+ case 2 : result.append("two "); break;
+ case 1 : result.append("one "); break;
+ case 0 : result.append(""); break;
+ }
+ }
+
+
+
+}
Index: contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/secondlevel/TestTokenPhraseSuggester.java
===================================================================
--- contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/secondlevel/TestTokenPhraseSuggester.java (revision 0)
+++ contrib/didyoumean/src/test/org/apache/lucene/search/didyoumean/secondlevel/TestTokenPhraseSuggester.java (revision 0)
@@ -0,0 +1,104 @@
+package org.apache.lucene.search.didyoumean.secondlevel;
+
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+import junit.framework.TestCase;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.*;
+import org.apache.lucene.index.facade.IndexWriterFacade;
+import org.apache.lucene.index.facade.DirectoryIndexFacade;
+import org.apache.lucene.index.facade.IndexFacade;
+import org.apache.lucene.search.didyoumean.secondlevel.token.TokenPhraseSuggester;
+import org.apache.lucene.search.didyoumean.secondlevel.token.TokenPhraseSuggesterImpl;
+import org.apache.lucene.search.didyoumean.secondlevel.token.ngram.NgramTokenSuggester;
+import org.apache.lucene.search.didyoumean.secondlevel.token.ngram.TermEnumIterator;
+import org.apache.lucene.store.RAMDirectory;
+
+import java.util.Collections;
+
+/**
+ * @author Karl Wettin
+ * Date: 2007-feb-02
+ * Time: 04:36:49
+ */
+public class TestTokenPhraseSuggester extends TestCase {
+
+ // private static Log log = LogFactory.getLog(TestNgramPhraseSuggester.class);
+ // private static long serialVersionUID = 1l;
+
+
+ public void testPhraseSuggester() throws Exception {
+
+ IndexFacade aprioriIndex = new DirectoryIndexFacade(new RAMDirectory());
+ aprioriIndex.indexWriterFactory(null, true).close();
+
+ Analyzer analyzer = new StandardAnalyzer(Collections.EMPTY_SET);
+ final String field = "field";
+
+ // the apriori index - used to build ngrams and to check if suggestions are any good.
+ IndexWriterFacade indexWriter = aprioriIndex.indexWriterFactory(analyzer, true);
+ addDocument(indexWriter, field, "heroes of might and magic III complete", Field.TermVector.WITH_POSITIONS_OFFSETS);
+ addDocument(indexWriter, field, "it might be the best game ever made", Field.TermVector.WITH_POSITIONS_OFFSETS);
+ addDocument(indexWriter, field, "forget about the rest", Field.TermVector.WITH_POSITIONS_OFFSETS);
+ addDocument(indexWriter, field, "it's all in the fame", Field.TermVector.WITH_POSITIONS_OFFSETS);
+ addDocument(indexWriter, field, "lost in translation", Field.TermVector.WITH_POSITIONS_OFFSETS);
+ addDocument(indexWriter, field, "on the little hill there is a little tree, never have I seen such a little tree", Field.TermVector.NO);
+ indexWriter.close();
+
+ IndexReader reader = aprioriIndex.indexReaderFactory();
+
+ // the single token suggester
+ IndexFacade ngramIndex = new DirectoryIndexFacade(new RAMDirectory());
+ ngramIndex.indexWriterFactory(null, true).close();
+
+ NgramTokenSuggester tokenSuggester = new NgramTokenSuggester(ngramIndex);
+ tokenSuggester.indexDictionary(new TermEnumIterator(reader, field), 2);
+
+ // the phrase suggester backed by single token suggester
+ TokenPhraseSuggester phraseSuggester = new TokenPhraseSuggesterImpl(tokenSuggester, field, false, 3, analyzer, aprioriIndex);
+
+ assertEquals("lost in translation", phraseSuggester.didYouMean("lost on translation"));
+ assertEquals("heroes might magic", phraseSuggester.didYouMean("magic light heros"));
+ assertEquals("heroes of might and magic", phraseSuggester.didYouMean("heros on light and magik"));
+ assertEquals("best game made", phraseSuggester.didYouMean("game best made"));
+ assertEquals("game made", phraseSuggester.didYouMean("made game"));
+ assertEquals("game made", phraseSuggester.didYouMean("made lame"));
+ assertEquals("the game", phraseSuggester.didYouMean("the game"));
+ assertEquals("in the fame", phraseSuggester.didYouMean("in the game"));
+ assertEquals("made", phraseSuggester.didYouMean("mede"));
+ assertEquals(0, phraseSuggester.suggest("may game", 3).size());
+
+ // make sure that non term positions act as they should
+
+ assertEquals("lost in translation", phraseSuggester.didYouMean("in lost translation"));
+ assertEquals("have never i seen", phraseSuggester.didYouMean("have nevrer i seen"));
+
+ reader.close();
+
+
+ }
+
+ private void addDocument(IndexWriterFacade indexWriter, String field, String text, Field.TermVector termVector) throws Exception {
+ Document document = new Document();
+ document.add(new Field(field, text, Field.Store.YES, Field.Index.TOKENIZED, termVector));
+ indexWriter.addDocument(document);
+ }
+
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/EditDistance.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/EditDistance.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/EditDistance.java (revision 0)
@@ -0,0 +1,52 @@
+package org.apache.lucene.search.didyoumean;
+
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * @author Karl Wettin
+ * Date: 2007-feb-17
+ * Time: 03:08:24
+ */
+public abstract class EditDistance {
+ protected final char[] sa;
+
+ public EditDistance(String sa) {
+ this.sa = sa.toCharArray();
+ }
+
+ // private static Log log = LogFactory.getLog(EditDistance.class);
+ // private static long serialVersionUID = 1l;
+
+ /**
+ * @param query string to measure distance to.
+ * @return distance to query
+ */
+ public abstract int getDistance(String query);
+
+ public double getNormalizedDistance(String query) {
+ return
+ 1.0d - ((double) getDistance(query) / Math
+ .min(query.length(), sa.length));
+
+ }
+
+ public char[] getSa() {
+ return sa;
+ }
+
+
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/Trainer.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/Trainer.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/Trainer.java (revision 0)
@@ -0,0 +1,54 @@
+package org.apache.lucene.search.didyoumean;
+
+import com.sleepycat.je.DatabaseException;
+import org.apache.lucene.search.didyoumean.dictionary.Dictionary;
+
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * The analyzing part of a session analyzed dictionary.
+ * It build and updates the dictionary based on what users did during their visit.
+ *
+ * @author Karl Wettin
+ * Date: Jul 31, 2006
+ * Time: 4:46:13 PM
+ */
+public interface Trainer {
+
+ /**
+ * @param dictionary the dictionary to update.
+ * @param goalTreeRoot the query goal tree to be used for training.
+ */
+ public abstract void trainGoalTree(Dictionary dictionary, QueryGoalNode goalTreeRoot) throws DatabaseException;
+
+
+// /**
+// * Places a goal tree in the queue.
+// *
+// * @param goalTreeRoot goal tree to be queued.
+// * @see org.apache.lucene.search.didyoumean.Trainer#flush(org.apache.lucene.search.didyoumean.dictionary.Dictionary)
+// */
+// public abstract void queueGoalTree(QueryGoalNode goalTreeRoot);
+//
+// /**
+// * Process all available goal trees in the queue.
+// *
+// * @param dictionary dictionary to work against
+// */
+// public abstract void flush(Dictionary dictionary) throws DatabaseException;
+
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/QuerySessionManager.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/QuerySessionManager.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/QuerySessionManager.java (revision 0)
@@ -0,0 +1,87 @@
+package org.apache.lucene.search.didyoumean;
+
+import com.sleepycat.je.DatabaseException;
+import com.sleepycat.je.Environment;
+import com.sleepycat.je.EnvironmentConfig;
+import com.sleepycat.persist.EntityStore;
+import com.sleepycat.persist.PrimaryIndex;
+import com.sleepycat.persist.StoreConfig;
+
+import java.io.File;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * @author karl wettin
+ * Date: 2007-okt-17
+ * Time: 18:42:42
+ */
+public class QuerySessionManager {
+
+ private EntityStore store;
+ private PrimaryIndex sessionsByID;
+
+
+ public QuerySessionManager(File bdbPath) throws DatabaseException {
+ if (!bdbPath.exists()) {
+ System.out.println("Creating path " + bdbPath);
+ bdbPath.mkdirs();
+ }
+
+ EnvironmentConfig envConfig = new EnvironmentConfig();
+ envConfig.setAllowCreate(true);
+ envConfig.setTransactional(false);
+ envConfig.setLocking(false);
+ Environment env = new Environment(bdbPath, envConfig);
+
+ StoreConfig storeConfig = new StoreConfig();
+ storeConfig.setAllowCreate(true);
+ storeConfig.setTransactional(false);
+ store = new EntityStore(env, "sessionManager", storeConfig);
+
+ sessionsByID = store.getPrimaryIndex(String.class, QuerySession.class);
+ }
+
+ public void close() throws DatabaseException {
+ store.close();
+ }
+
+ public EntityStore getStore() {
+ return store;
+ }
+
+ public PrimaryIndex getSessionsByID() {
+ return sessionsByID;
+ }
+
+ public synchronized QuerySession querySessionFactory() throws DatabaseException {
+ // ensure unique session id, todo better solution
+ try {
+ Thread.sleep(1);
+ } catch (InterruptedException ie) {
+ // whatever
+ }
+ return querySessionFactory(String.valueOf(System.currentTimeMillis()));
+ }
+
+ public QuerySession querySessionFactory(String id) throws DatabaseException {
+ QuerySession querySession = new QuerySession();
+ querySession.setId(id);
+ getSessionsByID().put(querySession);
+ return querySession;
+ }
+
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/Suggester.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/Suggester.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/Suggester.java (revision 0)
@@ -0,0 +1,41 @@
+package org.apache.lucene.search.didyoumean;
+
+import com.sleepycat.je.DatabaseException;
+import org.apache.lucene.search.didyoumean.dictionary.Dictionary;
+
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * A suggester is the entity that navigates a dictionary in search for a suggestion to a query.
+ *
+ * @author Karl Wettin
+ * Date: Jul 31, 2006
+ * Time: 4:46:57 PM
+ */
+public interface Suggester {
+
+ /**
+ * @param dictionary
+ * @param query
+ * @param n max number of suggestions @return
+ */
+ public abstract Suggestion[] didYouMean(Dictionary dictionary, String query, int n) throws DatabaseException;
+
+ public abstract String didYouMean(Dictionary dictionary, String query) throws DatabaseException ;
+
+
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/QuerySession.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/QuerySession.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/QuerySession.java (revision 0)
@@ -0,0 +1,160 @@
+package org.apache.lucene.search.didyoumean;
+
+import com.sleepycat.persist.model.Entity;
+import com.sleepycat.persist.model.PrimaryKey;
+
+import java.util.ArrayList;
+import java.util.List;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * @author karl wettin
+ * Date: 2007-okt-17
+ * Time: 17:24:47
+ */
+@Entity
+public class QuerySession {
+
+ private static long defaultExpirationTimeMilliseconds = 1000 * 60 * 10; // 10 minutes
+
+ @PrimaryKey
+ private String id;
+
+ private long lastTouched = System.currentTimeMillis();
+ private long expirationTimeMilliseconds = defaultExpirationTimeMilliseconds;
+
+ private List> nodes = new ArrayList>();
+
+ /**
+ * Sets the last query node as parent, or null if no previous nodes.
+ *
+ * @param query user query
+ * @param corpusQueryResults number of hits
+ * @return query node index
+ */
+ public Integer query(String query, Integer corpusQueryResults) {
+ return query(query, corpusQueryResults, null);
+ }
+
+
+
+ /**
+ * Sets the last query node as parent, or null if no previous nodes.
+ *
+ * @param query user query
+ * @param corpusQueryResults number of hits
+ * @param suggestion suggestion to user query as given by suggester
+ * @return query node index
+ */
+ public Integer query(String query, Integer corpusQueryResults, String suggestion) {
+ return query(query, corpusQueryResults, suggestion, System.currentTimeMillis());
+ }
+
+ /**
+ * @param parentNodeIndex parent query
+ * @param query user query
+ * @param corpusQueryResults number of hits
+ * @param suggestion suggestion to user query as given by suggester
+ * @return query node index
+ */
+ public Integer query(Integer parentNodeIndex, String query, Integer corpusQueryResults, String suggestion) {
+ return query(parentNodeIndex, query, corpusQueryResults, suggestion, System.currentTimeMillis());
+ }
+
+ /**
+ * Sets the last query node as parent, or null if no previous nodes.
+ *
+ * @param query user query
+ * @param corpusQueryResults number of hits
+ * @param suggestion suggestion to user query as given by suggester
+ * @param timeStamp
+ * @return query node index
+ */
+ public Integer query(String query, Integer corpusQueryResults, String suggestion, Long timeStamp) {
+ return query(nodes.size() == 0 ? null : nodes.size() - 1, query, corpusQueryResults, suggestion, timeStamp);
+ }
+
+ /**
+ *
+ * @param parentNodeIndex parent query
+ * @param query user query
+ * @param corpusQueryResults number of hits
+ * @param suggestion suggestion to user query as given by suggester
+ * @param timeStamp
+ * @return query node index
+ */
+ public synchronized Integer query(Integer parentNodeIndex, String query, Integer corpusQueryResults, String suggestion, Long timeStamp) {
+ nodes.add(new QueryGoalNode(parentNodeIndex == null ? null : nodes.get(parentNodeIndex), query, corpusQueryResults, suggestion, timeStamp));
+ lastTouched = timeStamp;
+ return nodes.size() - 1;
+ }
+
+ public void inspect(int nodeIndex, R reference, double goalClassification) {
+ inspect(nodeIndex, reference, goalClassification, System.currentTimeMillis());
+ }
+
+ public void inspect(int nodeIndex, R reference, double goalClassification, Long timeStamp) {
+ lastTouched = timeStamp;
+ nodes.get(nodeIndex).new Inspection(reference, goalClassification, timeStamp);
+ }
+
+ public static long getDefaultExpirationTimeMilliseconds() {
+ return defaultExpirationTimeMilliseconds;
+ }
+
+ public static void setDefaultExpirationTimeMilliseconds(long defaultExpirationTimeMilliseconds) {
+ QuerySession.defaultExpirationTimeMilliseconds = defaultExpirationTimeMilliseconds;
+ }
+
+ public String getId() {
+ return id;
+ }
+
+ public void setId(String id) {
+ this.id = id;
+ }
+
+ public long getExpirationTimeMilliseconds() {
+ return expirationTimeMilliseconds;
+ }
+
+ public void setExpirationTimeMilliseconds(long expirationTimeMilliseconds) {
+ this.expirationTimeMilliseconds = expirationTimeMilliseconds;
+ }
+
+
+ public List> getNodes() {
+ return nodes;
+ }
+
+ public void setNodes(List> nodes) {
+ this.nodes = nodes;
+ }
+
+
+ public long getLastTouched() {
+ return lastTouched;
+ }
+
+ public void setLastTouched(long lastTouched) {
+ this.lastTouched = lastTouched;
+ }
+
+ public boolean isExpired() {
+ return getLastTouched() + getExpirationTimeMilliseconds() < System.currentTimeMillis();
+ }
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/SuggestionFacade.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/SuggestionFacade.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/SuggestionFacade.java (revision 0)
@@ -0,0 +1,375 @@
+package org.apache.lucene.search.didyoumean;
+
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+import com.sleepycat.je.DatabaseException;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.facade.IndexFacade;
+import org.apache.lucene.index.facade.IndexFacadeFactory;
+import org.apache.lucene.index.facade.InstantiatedIndexFacade;
+import org.apache.lucene.search.didyoumean.dictionary.Dictionary;
+import org.apache.lucene.search.didyoumean.impl.DefaultAprioriCorpusFactory;
+import org.apache.lucene.search.didyoumean.impl.DefaultQueryGoalTreeExtractor;
+import org.apache.lucene.search.didyoumean.impl.DefaultSuggester;
+import org.apache.lucene.search.didyoumean.impl.DefaultTrainer;
+import org.apache.lucene.search.didyoumean.secondlevel.token.MultiTokenSuggester;
+import org.apache.lucene.search.didyoumean.secondlevel.token.SecondLevelTokenPhraseSuggester;
+import org.apache.lucene.search.didyoumean.secondlevel.token.ngram.NgramTokenSuggester;
+import org.apache.lucene.search.didyoumean.secondlevel.token.ngram.TermEnumIterator;
+import org.apache.lucene.store.instantiated.InstantiatedIndex;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.ConcurrentLinkedQueue;
+
+/**
+ * Consumer interface for the adaptive user session analyzing suggester.
+ *
+ * The simplest implementation would look something like this:
+ *
+ *
+ * The trainer is fed with trees of {@link org.apache.lucene.search.didyoumean.QueryGoalNode} instances. Each such
+ * tree represent the events that took place while a user tried to find content within a certain context: a goal tree.
+ * An instance of {@link org.apache.lucene.search.didyoumean.QueryGoalTreeExtractor} will help you to find and isolate
+ * all the goals in a tree representing a complete user session, as they sometimes contain more than one.
+ *
+ * It is up to the trainer and the suggester to decide how suggestions in the dictionary are stored and modified. Thus
+ * all trainers and suggesters might not be compatible with each other.
+ *
+ * {@link org.apache.lucene.search.didyoumean.impl.DefaultQueryGoalTreeExtractor}
+ * {@link org.apache.lucene.search.didyoumean.impl.DefaultTrainer}
+ * {@link org.apache.lucene.search.didyoumean.impl.DefaultSuggester}
+ *
+ * {@link org.apache.lucene.search.didyoumean.impl.DefaultAprioriCorpusFactory}
+ *
+ *
+ *
+ * @author Karl Wettin
+ * Date: 2007-feb-17
+ * Time: 02:32:50
+ */
+public class SuggestionFacade {
+
+ private Dictionary dictionary;
+ private Suggester suggester;
+
+ private QuerySessionManager querySessionManager;
+ private QueryGoalTreeExtractor queryGoalTreeExtractor;
+
+ private Trainer trainer;
+
+ private AprioriCorpusFactory aprioriCorpusFactory;
+
+ public SuggestionFacade(File dataRootPath) throws DatabaseException {
+ this(new File(dataRootPath, "dictionary"), new File(dataRootPath, "querySessionManager"));
+ }
+
+ public SuggestionFacade(File dictionaryPath, File querySessionManagerPath) throws DatabaseException {
+ this(dictionaryPath, querySessionManagerPath, new DefaultSuggester(), new DefaultTrainer(), new DefaultQueryGoalTreeExtractor(), new DefaultAprioriCorpusFactory());
+ }
+
+ public SuggestionFacade(File bdbRoot, Suggester suggester, Trainer trainer, QueryGoalTreeExtractor queryGoalTreeExtractor, AprioriCorpusFactory aprioriCorpusFactory) throws DatabaseException {
+ this(new File(bdbRoot, "dictionary"), new File(bdbRoot, "querySessionManager"), suggester, trainer, queryGoalTreeExtractor, aprioriCorpusFactory);
+ }
+
+
+ public SuggestionFacade(File dictionaryPath, File querySessionManagerPath, Suggester suggester, Trainer trainer, QueryGoalTreeExtractor queryGoalTreeExtractor, AprioriCorpusFactory aprioriCorpusFactory) throws DatabaseException {
+ this(new Dictionary(dictionaryPath), new QuerySessionManager(querySessionManagerPath), suggester, trainer, queryGoalTreeExtractor, aprioriCorpusFactory);
+ }
+
+ public SuggestionFacade(Dictionary dictionary, QuerySessionManager querySessionManager, Suggester suggester, Trainer trainer, QueryGoalTreeExtractor queryGoalTreeExtractor, AprioriCorpusFactory aprioriCorpusFactory) throws DatabaseException {
+ this.dictionary = dictionary;
+ this.querySessionManager = querySessionManager;
+ this.suggester = suggester;
+ this.trainer = trainer;
+ this.queryGoalTreeExtractor = queryGoalTreeExtractor;
+ this.aprioriCorpusFactory = aprioriCorpusFactory;
+
+ didYouMean("warmup"); // this is a bugfix
+ }
+
+
+ public void close() throws DatabaseException {
+ dictionary.close();
+ querySessionManager.close();
+ }
+
+ public Suggestion[] didYouMean(String query, int n) throws DatabaseException {
+ return getSuggester().didYouMean(getDictionary(), query, n);
+ }
+
+ public String didYouMean(String query) throws DatabaseException {
+ // todo remove debug
+ long ms = System.currentTimeMillis();
+ String ret = getSuggester().didYouMean(getDictionary(), query);
+ ms = System.currentTimeMillis() - ms;
+ System.out.println(ms + "ms\t " + query + " -> " + ret);
+ return ret;
+ }
+
+ /**
+ * Gathers and trains all expired query sessions from the query session manager
+ */
+ public synchronized void trainExpiredQuerySessions() throws DatabaseException {
+ trainExpiredQuerySessions(1);
+ }
+
+ /**
+ * Gathers and trains all expired query sessions from the query session manager
+ */
+ public synchronized void trainExpiredQuerySessions(int maxThreads) throws DatabaseException {
+ trainExpiredQuerySessions(maxThreads, 10000);
+ }
+
+ /**
+ * Gathers and trains all expired query sessions from the query session manager
+ */
+ public synchronized void trainExpiredQuerySessions(int maxThreads, int batchSize) throws DatabaseException {
+ int count = 0;
+ for (; ;) {
+
+ final ConcurrentLinkedQueue> queue = new ConcurrentLinkedQueue>();
+
+ for (QuerySession querySession : getQuerySessionManager().getSessionsByID().map().values()) {
+ if (querySession.isExpired()) {
+ count++;
+ queue.add((QuerySession) querySession);
+ if (queue.size() == batchSize) {
+ break;
+ }
+ }
+ }
+
+ if (queue.size() == 0) {
+ break;
+ }
+
+ Thread[] threads = new Thread[maxThreads];
+ for (int i = 0; i < threads.length; i++) {
+ threads[i] = new Thread(new Runnable() {
+ public void run() {
+ QuerySession session;
+ while ((session = queue.poll()) != null) {
+ try {
+ trainSession(session);
+ getQuerySessionManager().getSessionsByID().delete(session.getId());
+ } catch (DatabaseException dbe) {
+ dbe.printStackTrace();
+ // todo
+ }
+ }
+ }
+ });
+ threads[i].start();
+ }
+ for (Thread thread : threads) {
+ try {
+ thread.join();
+ } catch (InterruptedException ie) {
+ ie.printStackTrace();
+ // todo
+ }
+ }
+
+ System.out.println(count + " sessions trained.");
+ }
+
+ System.out.println("Finished training a total of " + count + " sessions.");
+
+ }
+
+ /**
+ * Extracts multiple goal trees from a tree containing all queries in a session,
+ * and queues each such goal tree to the trainer.
+ *
+ * @param session the session to be trained
+ * @throws DatabaseException
+ */
+ public void trainSession(QuerySession session) throws DatabaseException {
+ trainSessionQueryTree(session.getNodes().get(0));
+ }
+
+ /**
+ * Extracts multiple goal trees from a tree containing all queries in a session,
+ * and queues each such goal tree to the trainer.
+ *
+ * @param session any node (preferably the root) of a query goal tree
+ * @throws DatabaseException
+ */
+ public void trainSessionQueryTree(QueryGoalNode session) throws DatabaseException {
+ for (QueryGoalNode goalTreeRoot : getQueryGoalTreeExtractor().extractGoalRoots(session.getRoot())) {
+ getTrainer().trainGoalTree(getDictionary(), goalTreeRoot);
+ }
+ }
+
+
+ /**
+ * Compiles algorithmic second level suggesters based on the data in the dictionary.
+ *
+ * @return
+ * @throws IOException
+ * @throws DatabaseException
+ */
+ public Map secondLevelSuggestionFactory() throws IOException, DatabaseException {
+ return secondLevelSuggestionFactory(
+ new IndexFacadeFactory() {
+ public IndexFacade factory() throws IOException {
+ return new InstantiatedIndexFacade(new InstantiatedIndex());
+ }
+ }
+ );
+ }
+
+ /**
+ * Compiles algorithmic second level suggesters based on the data in the dictionary.
+ *
+= * @param indexFacadeFactory index in which to store ngrams created by all terms in the a priori corpus
+ * @return a second level suggester
+ * @throws IOException
+ * @throws DatabaseException
+ */
+ public Map secondLevelSuggestionFactory(IndexFacadeFactory indexFacadeFactory) throws IOException, DatabaseException {
+ return secondLevelSuggestionFactory(null, null, null, indexFacadeFactory, "apriori", indexFacadeFactory, 2, 7);
+ }
+
+ /**
+ * Compiles algorithmic second level suggesters based on the data in the dictionary.
+ *
+ * @param systemIndex system index user queries are placed in. optional.
+ * @param systemIndexField field in system index used to extract ngram tokens. must not be null if systemIndex is available.
+ * @param systemNgramIndexFacadeFactory
+ * @param aprioriIndexFacadeFactory index in which to store the created a priori corpus
+ * @param aprioriField field in a priori index to store values
+ * @param aprioriNgramIndexFacadeFactory index in which to store ngrams created by all terms in the a priori corpus
+ * @param minNgramSize minimum ngram size. 2 makes sense.
+ * @param maxSuggestionsPerWord maximum number of suggestions per word in matrix. A maximum of n^w queries will be placed.
+ * @return a second level suggester
+ * @throws IOException
+ * @throws DatabaseException
+ */
+ public Map secondLevelSuggestionFactory(IndexFacade systemIndex, String systemIndexField, IndexFacadeFactory systemNgramIndexFacadeFactory, IndexFacadeFactory aprioriIndexFacadeFactory, String aprioriField, IndexFacadeFactory aprioriNgramIndexFacadeFactory, int minNgramSize, int maxSuggestionsPerWord) throws IOException, DatabaseException {
+ if (systemIndex != null && systemIndexField == null) {
+ throw new NullPointerException("systemIndexField must be set if systemIndex is present.");
+ }
+
+ Map ret = new HashMap(2);
+
+ Analyzer analyzer = new StandardAnalyzer(Collections.emptySet());
+
+
+ System.out.println("Creating a priori corpus...");
+ IndexFacade aprioriIndex = aprioriIndexFacadeFactory.factory();
+
+ getAprioriCorpusFactory().factory(getDictionary(), getSuggester(), aprioriIndex, aprioriField, analyzer);
+
+ System.out.println("Creating ngram index from a priori corpus terms...");
+ IndexFacade aprioriNgramIndex = aprioriNgramIndexFacadeFactory.factory();
+ aprioriNgramIndex.indexWriterFactory(null, true).close(); // reset
+ NgramTokenSuggester ngramTokenSuggester = new NgramTokenSuggester(aprioriNgramIndex);
+ IndexReader aprioriIndexReader = aprioriIndex.indexReaderFactory();
+ ngramTokenSuggester.indexDictionary(new TermEnumIterator(aprioriIndexReader, aprioriField), minNgramSize);
+ aprioriIndexReader.close();
+
+ ret.put(new SecondLevelTokenPhraseSuggester(ngramTokenSuggester, aprioriField, false, maxSuggestionsPerWord, analyzer, aprioriIndex), 3d);
+
+ if (systemIndex != null) {
+ System.out.println("Creating ngram index from system corpus terms...");
+ IndexFacade systemNgramIndex = systemNgramIndexFacadeFactory.factory();
+ systemNgramIndex.indexWriterFactory(null, true).close(); // reset
+ NgramTokenSuggester sysetmNgramTokenSuggester = new NgramTokenSuggester(systemNgramIndex);
+ IndexReader systemIndexReader = systemIndex.indexReaderFactory();
+ ngramTokenSuggester.indexDictionary(new TermEnumIterator(systemIndexReader, systemIndexField), minNgramSize);
+ systemIndexReader.close();
+
+ ret.put(new MultiTokenSuggester(sysetmNgramTokenSuggester, systemIndexField, true, maxSuggestionsPerWord, analyzer, systemIndex), 1d);
+ }
+
+ return ret;
+ }
+
+
+ public Dictionary getDictionary() {
+ return dictionary;
+ }
+
+ public void setDictionary(Dictionary dictionary) {
+ this.dictionary = dictionary;
+ }
+
+ public Suggester getSuggester() {
+ return suggester;
+ }
+
+ public void setSuggester(Suggester suggester) {
+ this.suggester = suggester;
+ }
+
+ public Trainer getTrainer() {
+ return trainer;
+ }
+
+ public void setTrainer(Trainer trainer) {
+ this.trainer = trainer;
+ }
+
+ public QueryGoalTreeExtractor getQueryGoalTreeExtractor() {
+ return queryGoalTreeExtractor;
+ }
+
+ public void setQueryGoalTreeExtractor(QueryGoalTreeExtractor queryGoalTreeExtractor) {
+ this.queryGoalTreeExtractor = queryGoalTreeExtractor;
+ }
+
+
+ public QuerySessionManager getQuerySessionManager() {
+ return querySessionManager;
+ }
+
+ public void setQuerySessionManager(QuerySessionManager querySessionManager) {
+ this.querySessionManager = querySessionManager;
+ }
+
+
+ public AprioriCorpusFactory getAprioriCorpusFactory() {
+ return aprioriCorpusFactory;
+ }
+
+ public void setAprioriCorpusFactory(AprioriCorpusFactory aprioriCorpusFactory) {
+ this.aprioriCorpusFactory = aprioriCorpusFactory;
+ }
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/QueryGoalNode.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/QueryGoalNode.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/QueryGoalNode.java (revision 0)
@@ -0,0 +1,287 @@
+package org.apache.lucene.search.didyoumean;
+
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+import com.sleepycat.persist.model.Persistent;
+
+import java.io.Serializable;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.ArrayList;
+import java.util.concurrent.CopyOnWriteArrayList;
+
+/**
+ * A node in a tree describing what queries, suggestions and inspection of results
+ * that a user undertook while searching for a specific thing.
+ *
+ * @author Karl Wettin
+ * Date: 2007-jan-31
+ * Time: 06:23:42
+ */
+@Persistent
+public class QueryGoalNode {
+
+ private QueryGoalNode parent;
+ private List> children = new ArrayList>();
+
+ /** time of query */
+ private Long timestamp;
+ /** user query */
+ private String query;
+ /** number of hits in corpus from query */
+ private Integer corpusQueryResults;
+ /** the suggested text from the dictonary */
+ private String suggestion;
+
+ /** corpus query results inspected by user */
+ private List inspections = new LinkedList();
+
+ /** bdb persistence */
+ private QueryGoalNode() {
+ }
+
+ public QueryGoalNode(QueryGoalNode parentNode, String query, Integer corpusQueryResults) {
+ this(parentNode, query, corpusQueryResults, null, System.currentTimeMillis());
+ }
+
+ public QueryGoalNode(QueryGoalNode parentNode, String query, Integer corpusQueryResults, Long timeStamp) {
+ this(parentNode, query, corpusQueryResults, null, timeStamp);
+ }
+
+
+ public QueryGoalNode(QueryGoalNode parentNode, String query, Integer corpusQueryResults, String suggestion) {
+ this(parentNode, query, corpusQueryResults, suggestion, System.currentTimeMillis());
+ }
+
+ public QueryGoalNode(QueryGoalNode parentNode, String query, Integer corpusQueryResults, String suggestion, Long timeStamp) {
+ this.parent = parentNode;
+ if (parentNode != null) {
+ parentNode.getChildren().add(this);
+ }
+ this.query = query;
+ this.corpusQueryResults = corpusQueryResults;
+ this.suggestion = suggestion;
+ this.timestamp = timeStamp;
+ }
+
+ public Double calculateInspectionWeight() {
+ if (getInspections().size() == 0) {
+ return null;
+ }
+ double score = 1d;
+ for (Inspection inspection : getInspections()) {
+ score *= 1 + inspection.getGoalClassification();
+ }
+ return score;
+ }
+
+ public QueryGoalNode getRoot() {
+ QueryGoalNode node = this;
+ while (node.getParent() != null) {
+ node = node.getParent();
+ }
+ return node;
+ }
+
+
+ public QueryGoalNode getParent() {
+ return parent;
+ }
+
+ public void setParent(QueryGoalNode parent) {
+ this.parent = parent;
+ }
+
+ public List> getChildren() {
+ return children;
+ }
+
+ public void setChildren(List> children) {
+ this.children = children;
+ }
+
+ public String getQuery() {
+ return query;
+ }
+
+ public void setQuery(String query) {
+ this.query = query;
+ }
+
+ public String getSuggestion() {
+ return suggestion;
+ }
+
+ public void setSuggestion(String suggestion) {
+ this.suggestion = suggestion;
+ }
+
+ public Long getTimestamp() {
+ return timestamp;
+ }
+
+ public void setTimestamp(Long timestamp) {
+ this.timestamp = timestamp;
+ }
+
+
+ public Integer getcorpusQueryResults() {
+ return corpusQueryResults;
+ }
+
+ public void setcorpusQueryResults(Integer corpusQueryResults) {
+ this.corpusQueryResults = corpusQueryResults;
+ }
+
+ public List getInspections() {
+ return inspections;
+ }
+
+ public void setInspections(List inspections) {
+ this.inspections = inspections;
+ }
+
+ /**
+ * The user was really looking for Windows vista, but searched only for vista.
+ * Buena vista social club was found, but as this subject also was compelling
+ * the user went on and read about that instead.
+ */
+ public static final Double NO_PART_OF_THE_GOAL = -1d;
+
+ /**
+ * We have no clue to wether the inspected reference was a part of the goal or not.
+ */
+ public static final Double MOO = 0d;
+
+ /**
+ * To set this high value, the users should explicitally have told the system
+ * that they found just the thing they were looking for.
+ */
+ public static final Double GOAL = 1d;
+
+ @Persistent
+ public class Inspection {
+ /** reference to the inspected item. */
+ private R reference;
+ /** time of inspection */
+ private Long timeStamp;
+ /** wether or not this inspection was the goal of the query. */
+ private double goalClassification;
+
+ public Inspection(R reference, double goalClassification) {
+ this(reference, goalClassification, System.currentTimeMillis());
+ }
+
+ public Inspection(R reference, double goalClassification, Long timeStamp) {
+ this.reference = reference;
+ this.goalClassification = goalClassification;
+ this.timeStamp = timeStamp;
+ QueryGoalNode.this.getInspections().add(this);
+ }
+
+
+ public R getReference() {
+ return reference;
+ }
+
+ public void setReference(R reference) {
+ this.reference = reference;
+ }
+
+ public Long getTimeStamp() {
+ return timeStamp;
+ }
+
+ public void setTimeStamp(Long timeStamp) {
+ this.timeStamp = timeStamp;
+ }
+
+ public double getGoalClassification() {
+ return goalClassification;
+ }
+
+ public void setGoalClassification(double goalClassification) {
+ this.goalClassification = goalClassification;
+ }
+ }
+
+ public int numChildrenRecursive() {
+ int i = 0;
+ Iterator> it = iterateChildrenRecursive();
+ while (it.hasNext()) {
+ it.next();
+ i++;
+ }
+ return i;
+ }
+
+ public Iterator> iterateChildrenRecursive() {
+ return new Iterator>() {
+
+ Iterator> children = getChildren().iterator();
+ Iterator> subChildren;
+
+ Iterator> currentChildren;
+
+ QueryGoalNode currentChild;
+ QueryGoalNode nextChild;
+
+ public boolean hasNext() {
+
+ if (nextChild != null) {
+ return true;
+ }
+
+ while (true) {
+ if (subChildren == null) {
+ if (children.hasNext()) {
+ nextChild = children.next();
+ subChildren = nextChild.iterateChildrenRecursive();
+ currentChildren = children; // for remove
+ return true;
+ } else {
+ return false;
+ }
+ } else if (subChildren.hasNext()) {
+ nextChild = subChildren.next();
+ currentChildren = subChildren; // for remove
+ return true;
+ } else {
+ subChildren = null;
+ }
+ }
+ }
+
+ public QueryGoalNode next() {
+ hasNext();
+ currentChild = nextChild;
+ nextChild = null;
+ return currentChild;
+ }
+
+ public void remove() {
+ currentChildren.remove();
+ }
+ };
+ }
+
+ public String toString() {
+ return getQuery() + " " + getcorpusQueryResults() + " " + getInspections().size();
+ }
+
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/package.html
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/package.html (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/package.html (revision 0)
@@ -0,0 +1,318 @@
+
+
+
+
+
+
+
+
+
Abstract
+
+
+ A dictionary of correctly and incorrectly spelled words
+ pointing at one or many weighted suggestions
+ created from previous user activity
+ and backed by algorithmic suggestions.
+
+
+
Consumer interface
+
+
The simplest implementation would look something like this:
Usage can be much more complex than that and would then yeild in better suggestions.
+
+
Rationale
+
+
+ These are the results from contemplating on the relationships between
+
+
the user a priori (what the users knows to be right)
+
user queries (what the user is searching for in the system corpus) and how they are changed during a session in
+ quest for one or many specific goals
+
+
the results a user inspected and perhaps even classified by taking further actions
+
the system corpus (the information available to search in)
+
and the system a priori (things the system knows to be right, probably a part of the system corpus you trust to
+ be right. E.g. movie titles, but not user comments.)
+
+
+
+
+
+ As a user progress in their search for data in a corpus,
+ by supplying queries,
+ accepting and decline suggestions,
+ manually changing queries
+ and inspect results in different ways,
+ the system tries to find the goals of the event tree
+ and suggest these goals to future users with similair behaviour.
+
+
+
It is not completly wrong to call this a semi-retarded reinforcement learning strategy.
+
+
+ When a user requests a suggestion to a query prior unknown to the dictonary,
+ second level suggesters will attempt to come up with something.
+ These are usually algorithmic and not aware of the dictionary.
+
+
+
+ In a young implementation, or in one with very little user load,
+ the dictionary will more or less act as a cache against the second level suggesters.
+
+
+
+ Spell corrections, synonyms and acronyms
+ are the most common effects I have seen from an active adaptative suggester.
+
+
+
+ As all queries ever placed are stored in the dictionary (until pruned)
+ and the dictionary key is a whitespace and punctuation free version of the query,
+ any decomposited or composited typo will be handled.
+ For instance, "leonardo da vinci" will get the key "leonardodavinci" and thus suggest
+ "leonardo da vinci" in case the input query is "leonardo davinci".
+
+
+
Query goal trees
+
+
+ A user session could contain multiple quests for content.
+ For example:
+ first the user looks for the Apache licence,
+ spells it wrong, inspects different results,
+ and then the user searches for the author Ivan Goncharov.
+
+
+
+ In this package they are called different query goals.
+
+
+
+ It is up to the QueryGoalTreeExtractor implementations to decide what
+ events in a session are parts of the same goal,
+ as we don't want to suggest the user to check out Goncharov
+ when they are looking for the Apache license.
+
+
+
+ In the default query goal tree extractor,
+ nodes are parts of the same goal as their parent when:
+
+
The queries are the same.
+
The user took a suggestion from the system.
+
The current and the parent queries are similair enough.
+
The queries was entered within short enough time.
+
+
+
+ User activities are kept track of in a tree of QueryGoalNode:s,
+ where each node contains a user query,
+ if the current query (node) was a suggestion to a previous user query (node),
+ what search results was further inspected,
+ when it happend,
+ and for how long.
+
+
+
+ Figure A.
+
+
+ User first look for "heroes of knight and magic",
+ followed by "heroes of light and magic",
+ goes back to the first query (knight) and changes that to "heroes of night and magic",
+ followed by "heroes of might and magic".
+ The user adds a comment in the third hit
+ and then inspects the first hit,
+ see that "homm" is a common acronym and searches for that too.
+ Inspects two hits in homm
+
+
+
+ Figure B.
+
+
+
+
+ The tree is sent to a trainer that in this case guesses
+ "heroes of might and magic" and "homm" seems to be what the user was looking for.
+ All non-inspected results are then adapted to suggest the most similar goal.
+ The two final goals are adapted to suggest each other.
+
+
+
+
Adaptive training
+
+
+ Adaptive means that the suggestions to a query
+ depends on how users previously have been acting.
+ This means that the dictionary could be tampered with quite easy by hammering the trainer with data
+ and you should therefore try to train only with data from trusted users.
+
+
+
+ The default trainer implementation works like this:
+
+
If a user accepts the suggestion made by the system, then the score for that suggestion is incresed.
+ (positive adaptation)
+
+
If a user does not accept the suggestion made by the system, then the score for that suggestion is decreased.
+ (negative adaptation)
+
+
+ If the goal tree is a single query (perhaps with multiple inspections)
+ then adapt negative on suggestion once again.
+
+
+ Suggestions are the queries with inspections, ordered by the classification weight.
+ All the queries in the goal without inspections will be adpated positive with
+ the query with inspections that has the shortest edit distance.
+
+
Suggests back from best goal to second best goal. homm -> heroes of might and magic -> homm
+
+
+
+
Suggesting
+
+
+ Suggestions are created by the suggester, that navigates a dictionary.
+ The default implementation works like this:
+
+
+ Returns highest scoring suggestion available,
+ unless the score is lower than the suggestion supression threadshold.
+
+
+ If there are no suggestions available, the second level suggesters
+ registred to the dictionary are used to produce the suggestions.
+
+
+ If the top scoring suggestion is same as the query,
+ and the second best is not supressed below threadshold,
+ switch the top two suggestions. (This should probably be a setting.)
+
+
+ Ignoring a suggestion 50 times or so with a DefaultTrainer
+ supresses a suggestions below the default threadshold.
+ If a user manually change a query to a suppresed value
+ and thus be suggested again if no other suggestion is considered more important.
+
+
+
Second level suggesting
+
+
+ If the dictionary does not contain a suggestion for a given query,
+ it will be passed on to any available SecondLevelSuggester,
+ usually an algorithmic suggestion scheme
+ that hopefully can come up with something.
+ When a user accepts such a suggestion
+ it will be picked up by the adaptive trainer
+ and become a part of the dictionary.
+
+
+
A priori corpus dependency
+
+
+ In order for second level suggesters to work they need a corpus that is known to be correct:
+ an a priori corpus. This ought to be a small index that only contain text people do not know how
+ to spell, i.e. it only consume clock ticks at suggestion time if it is filled with phrases
+ people always get right.
+
+
+
A priori corpus extraction
+
+
+ By inverting the dictionary and extracting the tokens or phrases people have most problem spelling,
+ a pretty good a priori corpus can be created. This will however require a dictionary that has been
+ exposed to quite some user activity.
+
+
+
Token suggesters
+
+
+ The lowest level of suggestion is single token suggestions,
+ and the default implementation is a refactor of the contrib/spellcheck.
+
+
+
NgramTokenSuggester
+
+
A refactor of the Lucene crontrib/spellcheck single token ngram suggester
+
+
TokenPhraseSuggester
+
+
+ Combines the output from a single token suggester, applied to each token (word) in a phrase,
+ to create possible phrases. In essense this is the same thing as a SpanFuzzyQuery.
+
+
+
+ For example, the user places the query "thh best game".
+ The matrix of similar tokens are:
+
+ the best game
+ tho rest fame
+ lame
+
+ These can be combined as:
+
+ tho best game
+ tho best fame
+ tho best lame
+ tho rest game
+ tho rest fame
+ tho rest lame
+ the best game
+ the best fame
+ the best lame
+ the rest game
+ the rest fame
+ the rest lame
+
+ A query is created for each combination to find valid suggestions in the a priori index.
+ In the default implementation SpanNearQueries are used.
+
+
+ If any of the valid hits contains a TermPositionVector
+ it will be analyzed and suggest the query in the order of terms in the corpus.
+
+
+
+ E.g. query "camel that broke the staw" is suggested with "straw that broke the camel"
+
+
+
+ todo: if term offsets available and value stored, suggest that value (for cosmetic reasons).
+
+
+
MultiTokenSuggester
+
+
+ Works similar to the TokenPhraseSuggester, but uses term queries rather than span near queries,
+ used when people place queries where the terms within the query really has nothing to do with each other.
+ This second level suggester should have a lower priority than the TokenPhraseSuggester.
+
+
+
+
+
+
+
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/DefaultQueryGoalJuror.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/DefaultQueryGoalJuror.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/DefaultQueryGoalJuror.java (revision 0)
@@ -0,0 +1,78 @@
+package org.apache.lucene.search.didyoumean.impl;
+
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+import org.apache.lucene.search.didyoumean.QueryGoalNode;
+
+import java.util.*;
+
+/**
+ * Sets the chronologycally last placed query as the goal.
+ *
+ * @see QueryGoalJuror
+ * @author Karl Wettin
+ * Date: 2007-feb-23
+ * Time: 23:53:21
+ */
+public class DefaultQueryGoalJuror implements QueryGoalJuror {
+
+ // private static Log log = LogFactory.getLog(HitsBasedJuror.class);
+ // private static long serialVersionUID = 1l;
+
+ /**
+ * Sets the chronologycally last placed query as the goal.
+ * @param goalTreeRootNode Query goal tree root
+ * @return the goal nodes
+ * @throws RuntimeException If query goal tree already contains one or more goals.
+ */
+ public List> createGoals(QueryGoalNode goalTreeRootNode) {
+
+ List> nodes = new ArrayList>();
+
+ for (Iterator> children = goalTreeRootNode.iterateChildrenRecursive(); children.hasNext();) {
+ QueryGoalNode node = children.next();
+ for (QueryGoalNode.Inspection inspection : node.getInspections()) {
+ if (inspection.getGoalClassification() > QueryGoalNode.MOO) {
+ throw new RuntimeException("The query goal tree already contains one or more goals!");
+ }
+ }
+ nodes.add(node);
+ }
+ nodes.add(goalTreeRootNode);
+
+
+ Collections.sort(nodes, new Comparator>() {
+ public int compare(QueryGoalNode queryGoalNode, QueryGoalNode queryGoalNode1) {
+ return queryGoalNode1.getTimestamp().compareTo(queryGoalNode.getTimestamp());
+ }
+
+ });
+
+ List> goalNodes = new LinkedList>();
+
+ if (nodes.get(0).getcorpusQueryResults() > 0) {
+
+ nodes.get(0).new Inspection(null, QueryGoalNode.GOAL);
+ goalNodes.add(nodes.get(0));
+ }
+
+ return goalNodes;
+
+
+ }
+
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/DefaultQueryGoalTreeExtractor.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/DefaultQueryGoalTreeExtractor.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/DefaultQueryGoalTreeExtractor.java (revision 0)
@@ -0,0 +1,240 @@
+package org.apache.lucene.search.didyoumean.impl;
+
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+import org.apache.lucene.search.didyoumean.EditDistance;
+import org.apache.lucene.search.didyoumean.Levenshtein;
+import org.apache.lucene.search.didyoumean.QueryGoalNode;
+
+import java.io.Serializable;
+import java.util.*;
+import java.util.concurrent.ConcurrentLinkedQueue;
+
+/**
+ *
+ * A not too advanced way to extract the goals from a session.
+ *
+ * Nodes are parts of the same goal as their parent when:
+ *
+ *
the queries are the same
+ *
the suggestion to the parent query was followed
+ *
the queries are similair enough
+ *
the queries was entered within short enough time
+ *
+ *
+ * @author Karl Wettin
+ * Date: Aug 1, 2006
+ * Time: 5:01:14 PM
+ */
+public class DefaultQueryGoalTreeExtractor implements org.apache.lucene.search.didyoumean.QueryGoalTreeExtractor, Serializable {
+
+ private static final long serialVersionUID = 1l;
+
+ public static final long DEFAULT_MAXIMUM_MILLISECONDS_BETWEEN_QUERIES = 20000;
+ public static final double DEFAULT_MINIMUM_SIMILARITY = 0.6f;
+ public static final int DEFAULT_MINIMUM_HITS_IN_FINAL_QUERY = 1;
+ public static final double DEFAULT_MINIMUM_HITS_RATIO = 0f;
+
+
+ private long maximumMillisecondsBetweenQueries = DEFAULT_MAXIMUM_MILLISECONDS_BETWEEN_QUERIES;
+ private double minimumSimilarity = DEFAULT_MINIMUM_SIMILARITY;
+ private int minimumHitsInFinalQuery = DEFAULT_MINIMUM_HITS_IN_FINAL_QUERY;
+ private double minimumHitsRatio = DEFAULT_MINIMUM_HITS_RATIO;
+
+ private double inspectionWeightGoalThreadshold = 0d;
+
+ private class GoalFactory {
+ private List> goalQueries = new LinkedList>();
+ }
+
+ public boolean isPartOfParentGoal(QueryGoalNode child) {
+ if (child.getParent() == null) {
+ // no parent
+ return false;
+ } else if (child.getQuery().equals(child.getParent().getSuggestion())) {
+ // the user followed a suggestion made by the system.
+ return true;
+ } else if (child.getQuery().equals(child.getParent().getQuery())) {
+ // it is the same query placed once more.
+ // todo: consider if this should have been merged when creating a new child in DefaultTrainer?
+ return true;
+ } else {
+ // check edit distance
+
+ EditDistance editDistance = editDistanceFactory(child.getParent().getQuery());
+ //double similarity = ((double) distance / previousLink.link.length());
+ double similarity = 1.0f - ((double) editDistance.getDistance(child.getQuery()) / Math.min(child.getQuery().length(), child.getParent().getQuery().length()));
+ if (similarity >= getMinimumSimilarity()) {
+ // similair enough
+ return true;
+ } else if (child.getTimestamp() - child.getParent().getTimestamp() < getMaximumMillisecondsBetweenQueries()) {
+ // todo: consider time spent inspecting et c.
+
+ // not similair enough.
+ // but fast enough between searches.
+
+ // if the child or its children reached the goal // todo recursive
+ // then it is a part of the parent goal.
+
+ Double weight = child.calculateInspectionWeight();
+ if (weight == null) {
+ return false;
+ } else {
+ return weight > getInspectionWeightGoalThreadshold();
+ }
+
+ }
+ }
+
+ return false;
+ }
+
+ public EditDistance editDistanceFactory(String sd) {
+ return new Levenshtein(sd);
+ }
+
+
+ public List> extractGoalRoots(QueryGoalNode sessionRoot) {
+
+ List> goalRoots = new LinkedList>();
+
+ Map, GoalFactory> goalFactoryByNodes = new HashMap, GoalFactory>();
+
+
+ Queue> leafsQueue = new ConcurrentLinkedQueue>();
+ for (Iterator> leafsIterator = sessionRoot.iterateChildrenRecursive(); leafsIterator.hasNext();) {
+ QueryGoalNode node = leafsIterator.next();
+ if (node.getChildren().size() == 0) {
+ leafsQueue.add(node);
+ }
+ }
+
+ if (leafsQueue.size() == 0) {
+ // this is just a single node.
+ goalRoots.add(sessionRoot);
+ } else while (leafsQueue.size() > 0) {
+ QueryGoalNode node = leafsQueue.poll();
+ while (isPartOfParentGoal(node)) {
+ GoalFactory nodeGoalFactory = goalFactoryByNodes.get(node);
+ if (nodeGoalFactory == null) {
+ nodeGoalFactory = new GoalFactory();
+ nodeGoalFactory.goalQueries.add(node);
+ goalFactoryByNodes.put(node, nodeGoalFactory);
+ }
+
+ if (node.getParent() != null) {
+ GoalFactory parentGoalFactory = goalFactoryByNodes.get(node.getParent());
+ if (parentGoalFactory != null) {
+ // the parent has already been added to a goal factory
+ // merge this goal factory with the parent goal factory.
+ for (QueryGoalNode childGoalNode : goalFactoryByNodes.get(node).goalQueries) {
+ goalFactoryByNodes.put(childGoalNode, parentGoalFactory);
+ parentGoalFactory.goalQueries.add(childGoalNode);
+ }
+ } else {
+ goalFactoryByNodes.put(node, nodeGoalFactory);
+ nodeGoalFactory.goalQueries.add(node.getParent());
+ }
+ }
+
+ node = node.getParent();
+ }
+
+ // make current node a new root
+ if (node.getParent() != null) {
+ leafsQueue.add(node.getParent()); // parent is a new goal leaf!
+ node.getParent().getChildren().remove(node);
+ node.setParent(null);
+ }
+ goalRoots.add(node);
+ }
+
+ return goalRoots;
+ }
+
+ /**
+ * @return maximun time in milliseconds between two queries to be considered part of the same correction sequence
+ */
+ public final long getMaximumMillisecondsBetweenQueries() {
+ return maximumMillisecondsBetweenQueries;
+ }
+
+ /**
+ * @param maximumMillisecondsBetweenQueries
+ * maximun time in milliseconds between two queries to be considered part of the same correction sequence
+ */
+ public final void setMaximumMillisecondsBetweenQueries(long maximumMillisecondsBetweenQueries) {
+ this.maximumMillisecondsBetweenQueries = maximumMillisecondsBetweenQueries;
+ }
+
+ /**
+ * @return minimum edit-distance between two queries in order to be considered part of the same correction sequence
+ */
+ public final double getMinimumSimilarity() {
+ return minimumSimilarity;
+ }
+
+ /**
+ * Using Levenshtein edit distance factory,
+ * anything less than 0.5 is crazy,
+ * 0.7 can be almost be considered fail-safe,
+ * but as this is an adapting spell checker some bad data is ok,
+ * so I set default to 0.6
+ *
+ * @param minimumSimilarity minimum edit-distance between two queries in order to be considered part of the same correction sequence
+ */
+ public final void setMinimumSimilarity(double minimumSimilarity) {
+ this.minimumSimilarity = minimumSimilarity;
+ }
+
+ /**
+ * @return number of hits required in the final query of a correction sequence
+ */
+ public final int getMinimumHitsInFinalQuery() {
+ return minimumHitsInFinalQuery;
+ }
+
+ /**
+ * @param minimumHitsInFinalQuery number of hits required in the final query of a correction sequence
+ */
+ public final void setMinimumHitsInFinalQuery(int minimumHitsInFinalQuery) {
+ this.minimumHitsInFinalQuery = minimumHitsInFinalQuery;
+ }
+
+ /**
+ * @return minimum allowed ratio of the low and top number of hits of any two queries in a correction sequence. default to 2, that is twice as many hits in the top resulting query is required
+ */
+ public final double getMinimumHitsRatio() {
+ return minimumHitsRatio;
+ }
+
+ /**
+ * @param minimumHitsRatio minimum allowed ratio of the low and top number of hits of any two queries in a correction sequence. default to 2, that is twice as many hits in the top resulting query is required
+ */
+ public final void setMinimumHitsRatio(double minimumHitsRatio) {
+ this.minimumHitsRatio = minimumHitsRatio;
+ }
+
+
+ public double getInspectionWeightGoalThreadshold() {
+ return inspectionWeightGoalThreadshold;
+ }
+
+ public void setInspectionWeightGoalThreadshold(double inspectionWeightGoalThreadshold) {
+ this.inspectionWeightGoalThreadshold = inspectionWeightGoalThreadshold;
+ }
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/DefaultTrainer.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/DefaultTrainer.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/DefaultTrainer.java (revision 0)
@@ -0,0 +1,321 @@
+package org.apache.lucene.search.didyoumean.impl;
+
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+import org.apache.lucene.search.didyoumean.*;
+import org.apache.lucene.search.didyoumean.dictionary.Dictionary;
+import org.apache.lucene.search.didyoumean.dictionary.SuggestionList;
+
+import java.util.*;
+
+import com.sleepycat.je.DatabaseException;
+
+/**
+ * A simple adapting suggestion strategy that updates the content of a dictionary based on what a query session looked like.
+ *
+ * If a user accepts the suggestion made by the system, then we increase the score for that suggestion. (positive adaptation)
+ * If a user does not accept the suggestion made by the system, then we decrease the score for that suggestion. (negative adaptation)
+ *
+ * The the goal tree is a single query, one query only (perhaps with multiple inspections)
+ * then we adapt negative once again.
+ *
+ * Suggestions are the query with inspections, ordered by the classification weight.
+ * All the queries in the goal witout inspections will be adpated positive with
+ * the query with inspections that has the shortest edit distance.
+ *
+ * Suggest back from best goal to second best goal. homm -> heroes of might and magic -> homm
+ *
+ * In case of no inspections in a goal tree, a {@link org.apache.lucene.search.didyoumean.impl.QueryGoalJuror} is used to judge what is the goal.
+ *
+ * @author Karl Wettin
+ * Date: Jul 31, 2006
+ * Time: 4:56:13 PM
+ */
+public class DefaultTrainer implements Trainer {
+
+ private boolean trainingFinalGoalAsSelf = true;
+
+ private double notSuggestedPositiveAdaptationFactor = 1.4d;
+ private double acceptedSuggestionPositiveAdaptationFactor = 1.4d;
+ private double ignoredSuggestionNegativeAdaptationFactor = 0.9d;
+
+ private QueryGoalJuror juror = new DefaultQueryGoalJuror();
+
+ private static Comparator scoreComparator = new Comparator() {
+ public int compare(Suggestion o1, Suggestion o2) {
+ return Double.compare(o2.getScore(), o1.getScore());
+ }
+ };
+
+
+ public void trainGoalTree(Dictionary dictionary, QueryGoalNode goalTreeRoot) throws DatabaseException {
+
+ int numChildrenRecursive = 0;
+ // positive and negative adaptation of suggestion scores
+ for (Iterator> it = goalTreeRoot.iterateChildrenRecursive(); it.hasNext();) {
+ numChildrenRecursive++;
+ QueryGoalNode node = it.next();
+ if (node.getParent().getSuggestion() != null) {
+
+ SuggestionList suggestions = dictionary.getSuggestions(dictionary.keyFormatter(node.getParent().getQuery()));
+ Suggestion suggestion = suggestions.get(node.getParent().getSuggestion());
+
+ if (node.getQuery().equals(node.getParent().getSuggestion())) {
+ // user took our suggestion, increase the score of that suggestion.
+ suggestion.setScore(suggestion.getScore() * getAcceptedSuggestionPositiveAdaptationFactor());
+ } else {
+ // user did not take our suggestion, decrease the score of that suggestion.
+ suggestion.setScore(suggestion.getScore() * getIgnoredSuggestionNegativeAdaptationFactor());
+ }
+
+ suggestions.sort();
+ dictionary.getSuggestionsByQuery().put(suggestions);
+
+
+ }
+ }
+
+ if (numChildrenRecursive == 0) {
+
+ // a single query
+
+ boolean train = false;
+ for (QueryGoalNode.Inspection inspection : goalTreeRoot.getInspections()) {
+ if (inspection.getGoalClassification() > QueryGoalNode.MOO) {
+ train = true;
+ break;
+ }
+ }
+ if (train) {
+ adaptPositive(dictionary, goalTreeRoot.getQuery(), goalTreeRoot.getcorpusQueryResults(), goalTreeRoot);
+ }
+//
+// // if inspected, negative train on top suggestion,
+// // but only if available and not the same as the query
+// SuggestionList suggestions = dictionary.getSuggestions(goalTreeRoot.getQuery());
+// if (goalTreeRoot.getInspections().size() > 0 && suggestions != null && suggestions.size() > 0 && !suggestions.get(0).getSuggested().equals(goalTreeRoot.getQuery())) {
+// suggestions.get(0).setPopularity(suggestions.get(0).getPopularity() * getIgnoredSuggestionNegativeAdaptationFactor());
+// suggestions.sort();
+// }
+ } else {
+
+ /** suggestions are the nodes with inspections, ordered by the classification weight. */
+ List> nodesWithGoals = new LinkedList>();
+
+ /** all the other nodes, them without inspections, will suggest the suggestions. */
+ List> nodesWithoutGoals = new LinkedList>();
+
+ if (goalTreeRoot.getInspections().size() > 0) {
+ nodesWithGoals.add(goalTreeRoot);
+ } else {
+ nodesWithoutGoals.add(goalTreeRoot);
+ }
+ for (Iterator> it = goalTreeRoot.iterateChildrenRecursive(); it.hasNext();) {
+ QueryGoalNode node = it.next();
+ if (node.getInspections().size() > 0) {
+ for (QueryGoalNode.Inspection inspection : node.getInspections()) {
+ if (inspection.getGoalClassification() > QueryGoalNode.MOO) {
+ nodesWithGoals.add(node);
+ break;
+ }
+ }
+ }
+
+ if (!nodesWithGoals.contains(node)) {
+ nodesWithoutGoals.add(node);
+ }
+ }
+
+ if (nodesWithGoals.size() == 0) {
+ // there was no inspections.
+ // we have no clue what the users was looking for. not sure if this mean something extra.
+
+ // call juror strategy
+ nodesWithGoals = getJuror().createGoals(goalTreeRoot);
+ nodesWithoutGoals.removeAll(nodesWithGoals);
+ if (nodesWithGoals.size() == 0) {
+ return;
+ }
+ }
+
+ {
+ // order by inspection weight and time - most recent query with top importance.
+ Collections.sort(nodesWithGoals, new Comparator>() {
+ public int compare(QueryGoalNode queryGoalNode, QueryGoalNode queryGoalNode1) {
+ int ret = Double.compare(queryGoalNode1.calculateInspectionWeight(), queryGoalNode.calculateInspectionWeight());
+ if (ret == 0) {
+ // todo: consider by number of hits? does the same inspection weight but more hits mean something better or something worse?
+
+ // by time
+ if (queryGoalNode1.getTimestamp() == queryGoalNode.getTimestamp()) {
+ return 0;
+ } else if (queryGoalNode1.getTimestamp() > queryGoalNode.getTimestamp()) {
+ return 1;
+ } else {
+ return -1;
+ }
+ } else {
+ return ret;
+ }
+ }
+ });
+
+ if (isTrainingFinalGoalAsSelf()) {
+ // we also train the suggestion as suggested self,
+ // as we store the dictionary key stripped from characters.
+ // this will suggest "the da vinci code" from "thedavincicode" and "the davinci code".
+
+ // it is however not needed to suppres bad suggestions from beeing suggested.
+ // for that its enough with a low enough suggestion score detected by the suggester.
+
+ for (QueryGoalNode node : nodesWithGoals) {
+
+ // but only register it once.
+ SuggestionList suggestions = dictionary.getSuggestions(node.getQuery());
+ if (suggestions == null) {
+ suggestions = dictionary.suggestionListFactory(node.getQuery());
+ }
+
+ if (!suggestions.containsSuggested(nodesWithGoals.get(0).getQuery())) {
+ suggestions.addSuggested(nodesWithGoals.get(0).getQuery(), 1d, nodesWithGoals.get(0).getcorpusQueryResults());
+ dictionary.getSuggestionsByQuery().put(suggestions);
+ }
+
+ // uncomment to adapt every time
+ // adaptPositive(dictionary, nodesWithGoals.get(0).getQuery(), nodesWithGoals.get(0).getCorpusQueryResults(), node);
+ }
+ }
+
+ // suggest back from best goal to second best goal. homm -> heroes of might and magic -> homm
+ if (nodesWithGoals.size() > 1) {
+ adaptPositive(dictionary, nodesWithGoals.get(1).getQuery(), nodesWithGoals.get(1).getcorpusQueryResults(), nodesWithGoals.get(0));
+ }
+
+ // node without inspections are suggested to try the node with inspections that has least edit distance to the query.
+ for (QueryGoalNode node : nodesWithoutGoals) {
+ QueryGoalNode closestNode = findNodeWithShortestDistanceToQuery(node, nodesWithGoals);
+ adaptPositive(dictionary, closestNode.getQuery(), closestNode.getcorpusQueryResults(), node);
+ }
+ }
+ }
+ }
+
+
+ public EditDistance editDistanceFactory(String sd) {
+ return new Levenshtein(sd);
+ }
+
+
+ private QueryGoalNode findNodeWithShortestDistanceToQuery(QueryGoalNode queryNode, Collection> targetNodes) {
+ EditDistance editDistance = editDistanceFactory(queryNode.getQuery());
+ QueryGoalNode closest = null;
+ double closestDistance = Double.MAX_VALUE;
+ for (QueryGoalNode targetNode : targetNodes) {
+ double distanceToTarget = editDistance.getDistance(targetNode.getQuery());
+ if (distanceToTarget < closestDistance) {
+ closestDistance = distanceToTarget;
+ closest = targetNode;
+ }
+ }
+ return closest;
+ }
+
+ private void adaptPositive(Dictionary dictionary, String suggested, Integer suggestedCorpusQueryResults, QueryGoalNode dictionaryKeyNode) throws DatabaseException {
+ SuggestionList suggestions = dictionary.getSuggestions(dictionaryKeyNode.getQuery());
+ if (suggestions == null) {
+ suggestions = dictionary.suggestionListFactory(dictionaryKeyNode.getQuery());
+ suggestions.addSuggested(suggested, 1d, suggestedCorpusQueryResults);
+ } else {
+ boolean suggestionUpdated = false;
+ for (Suggestion existingSuggestion : suggestions) {
+ if (existingSuggestion.getSuggested().equals(suggested)) {
+ // the query already have this suggestion in the suggestions.
+ // increase the score for the suggestion. (positive adaptation)
+ double score = existingSuggestion.getScore() * getNotSuggestedPositiveAdaptationFactor();
+ if (score > 9999) {
+ score = 9999;
+ }
+ existingSuggestion.setScore(score);
+ suggestionUpdated = true;
+ break;
+ }
+ }
+ if (!suggestionUpdated) {
+ suggestions.addSuggested(suggested, 1d, suggestedCorpusQueryResults);
+ }
+ suggestions.sort();
+ }
+ dictionary.getSuggestionsByQuery().put(suggestions);
+ }
+
+
+ public double getAcceptedSuggestionPositiveAdaptationFactor() {
+ return acceptedSuggestionPositiveAdaptationFactor;
+ }
+
+ public void setAcceptedSuggestionPositiveAdaptationFactor(double acceptedSuggestionPositiveAdaptationFactor) {
+ this.acceptedSuggestionPositiveAdaptationFactor = acceptedSuggestionPositiveAdaptationFactor;
+ }
+
+ public double getIgnoredSuggestionNegativeAdaptationFactor() {
+ return ignoredSuggestionNegativeAdaptationFactor;
+ }
+
+ public void setIgnoredSuggestionNegativeAdaptationFactor(double ignoredSuggestionNegativeAdaptationFactor) {
+ this.ignoredSuggestionNegativeAdaptationFactor = ignoredSuggestionNegativeAdaptationFactor;
+ }
+
+ public double getNotSuggestedPositiveAdaptationFactor() {
+ return notSuggestedPositiveAdaptationFactor;
+ }
+
+ public void setNotSuggestedPositiveAdaptationFactor(double notSuggestedPositiveAdaptationFactor) {
+ this.notSuggestedPositiveAdaptationFactor = notSuggestedPositiveAdaptationFactor;
+ }
+
+ public boolean isTrainingFinalGoalAsSelf() {
+ return trainingFinalGoalAsSelf;
+ }
+
+ public void setTrainingFinalGoalAsSelf(boolean trainingFinalGoalAsSelf) {
+ this.trainingFinalGoalAsSelf = trainingFinalGoalAsSelf;
+ }
+
+ public static Comparator getScoreComparator() {
+ return scoreComparator;
+ }
+
+ public static void setScoreComparator(Comparator scoreComparator) {
+ DefaultTrainer.scoreComparator = scoreComparator;
+ }
+
+
+ /**
+ * @return Stragegy used when a query goal tree has no goals.
+ */
+ public QueryGoalJuror getJuror() {
+ return juror;
+ }
+
+ /**
+ * @param juror Stragegy used when a query goal tree has no goals.
+ */
+ public void setJuror(QueryGoalJuror juror) {
+ this.juror = juror;
+ }
+
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/QueryGoalJuror.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/QueryGoalJuror.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/QueryGoalJuror.java (revision 0)
@@ -0,0 +1,37 @@
+package org.apache.lucene.search.didyoumean.impl;
+
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+import org.apache.lucene.search.didyoumean.QueryGoalNode;
+
+import java.util.List;
+
+/**
+ * When a query goal tree has no goals, an implementation of this interface is used to set them up.
+ *
+ * @author Karl Wettin
+ * Date: 2007-feb-24
+ * Time: 02:25:25
+ */
+public interface QueryGoalJuror {
+
+ /**
+ * @param goalTreeRootNode Query goal tree root node.
+ * @return the goal nodes
+ */
+ public abstract List> createGoals(QueryGoalNode goalTreeRootNode);
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/DefaultSuggester.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/DefaultSuggester.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/DefaultSuggester.java (revision 0)
@@ -0,0 +1,232 @@
+package org.apache.lucene.search.didyoumean.impl;
+
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+import com.sleepycat.je.DatabaseException;
+import org.apache.lucene.search.didyoumean.AbstractSuggester;
+import org.apache.lucene.search.didyoumean.EditDistance;
+import org.apache.lucene.search.didyoumean.Levenshtein;
+import org.apache.lucene.search.didyoumean.Suggestion;
+import org.apache.lucene.search.didyoumean.dictionary.Dictionary;
+import org.apache.lucene.search.didyoumean.dictionary.SuggestionList;
+
+import java.io.Serializable;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.LinkedList;
+import java.util.List;
+
+/**
+ * Returns highest scoring suggestion available,
+ * unless the score is lower than the suggestion supression threadshold.
+ *
+ * If there are no suggestions available, the second level suggesters
+ * registred to the dictionary are used to produce the suggestions.
+ *
+ * If the top scoring suggestion is same as the query,
+ * and the second best is not supressed below threadshold,
+ * change order
+ *
+ * Ignoring a suggestion 50 times or so with a DefaultTrainer makes a score hit 0.05d.
+ *
+ * It will navigate towards better suggestions this way:
+ *
+ * Suggestion[] currentSuggestions = toQuerySensitiveArray(suggestions, n, query);
+ * Suggestion[] topSuggestionSuggestions = toQuerySensitiveArray(gatherSuggestionList(dictionary, currentSuggestions[0].getSuggested(), n), n, currentSuggestions[0].getSuggested());
+ * // if the input query results in bad results according to some strategy,
+ * // then attempt to navigate towards a top suggestion that does result in something good,
+ * while (
+ * topSuggestionSuggestions != null && topSuggestionSuggestions.length > 0
+ * && (currentSuggestions[0].getCorpusQueryResults().size() == 0
+ * || currentSuggestions[0].getCorpusQueryResults().size() * 3 < topSuggestionSuggestions[0].getCorpusQueryResults().size())) {
+ * currentSuggestions = topSuggestionSuggestions;
+ * topSuggestionSuggestions = toQuerySensitiveArray(gatherSuggestionList(dictionary, currentSuggestions[0].getSuggested(), n), n, currentSuggestions[0].getSuggested());
+ * }
+ *
+ * todo: above should be an aggregated strategy!
+ *
+ * @author Karl Wettin
+ * Date: Jul 31, 2006
+ * Time: 4:56:34 PM
+ */
+public class DefaultSuggester extends AbstractSuggester implements Serializable {
+
+ private static final long serialVersionUID = 1l;
+
+
+ public DefaultSuggester() {
+ }
+
+ /**
+ * ignoring a suggestion 50 times or so makes hit 0.05d.
+ */
+ private double suggestionSupressionThreadshold = 0.05d;
+
+ private SuggestionList gatherSuggestionList(Dictionary dictionary, String query, int n) throws DatabaseException {
+ SuggestionList suggestions = dictionary.getSuggestions(query);
+ if (suggestions != null && suggestions.size() > 0) {
+ // if top suggestion is suppressed,
+ // then try to get some more suggestions from second level
+ if (suggestions.get(0).getScore() < getSuggestionSupressionThreadshold()) {
+ Suggestion[] secondLevel = dictionary.getSecondLevelSuggestion(query, n);
+ if (secondLevel != null && secondLevel.length > 0) {
+ // then merge the not yet seen suggestions with the dictionary suggestions.
+ // this way we don't suggest something that has been suppressed.
+ double score = 1d;
+ for (Suggestion suggestion : secondLevel) {
+ score -= 0.01d;
+ if (!suggestions.containsSuggested(suggestion.getSuggested())) {
+ suggestions.addSuggested(suggestion.getSuggested(), score, suggestion.getCorpusQueryResults());
+ dictionary.getSuggestionsByQuery().put(suggestions);
+ }
+ }
+ }
+ }
+ }
+ return suggestions;
+ }
+
+ public Suggestion[] didYouMean(Dictionary dictionary, String query, int n) throws DatabaseException {
+ SuggestionList suggestions = gatherSuggestionList(dictionary, query, n);
+ if (suggestions != null) {
+ if (suggestions.size() > 0) {
+ Suggestion[] originalSuggestions = toQuerySensitiveArray(suggestions, n, query);
+ Suggestion[] currentSuggestions = originalSuggestions;
+ Suggestion[] topSuggestionSuggestions = toQuerySensitiveArray(gatherSuggestionList(dictionary, currentSuggestions[0].getSuggested(), n), n, currentSuggestions[0].getSuggested());
+ // if the input query results in bad results according to some strategy,
+ // then attempt to navigate towards a top suggestion that does result in something good,
+ int noEternalLoops = 0;
+ while (hasBetterNestedSuggestion(topSuggestionSuggestions, currentSuggestions)) {
+ // no eternal loops!
+ if (++noEternalLoops == 100) {
+ System.out.println(query + " points back at it self or is too complex!");
+ return originalSuggestions;
+ }
+ currentSuggestions = topSuggestionSuggestions;
+ topSuggestionSuggestions = toQuerySensitiveArray(gatherSuggestionList(dictionary, currentSuggestions[0].getSuggested(), n), n, currentSuggestions[0].getSuggested());
+ }
+ return currentSuggestions;
+ } else {
+ return dictionary.getSecondLevelSuggestion(query, n);
+ }
+
+ } else {
+ return dictionary.getSecondLevelSuggestion(query, n);
+ }
+ }
+
+ /**
+ * If topSuggestionSuggestions points at something
+ * that has three times as many results as the top current suggestion
+ * then this method returns true.
+ *
+ * if the current suggestion has no hits
+ * and there is a suggestion with hits in topSuggestionSuggestions
+ * this method returns true
+ *
+ * @param topSuggestionSuggestions suggestions to first suggestion in currentSuggestions
+ * @param currentSuggestions suggestions to the current query
+ * @return true if topSuggestionSuggestions has a better suggestion than currentSuggestions
+ */
+ public boolean hasBetterNestedSuggestion(Suggestion[] topSuggestionSuggestions, Suggestion[] currentSuggestions) {
+ return topSuggestionSuggestions != null
+ && topSuggestionSuggestions.length > 0
+ && currentSuggestions[0].getCorpusQueryResults() != null
+ &&
+ ((currentSuggestions[0].getCorpusQueryResults() == 0 && topSuggestionSuggestions[0].getCorpusQueryResults() > 0)
+ || (currentSuggestions[0].getCorpusQueryResults() * 3 < topSuggestionSuggestions[0].getCorpusQueryResults())
+ );
+ }
+
+
+ public EditDistance editDistanceFactory(String sd) {
+ return new Levenshtein(sd);
+ }
+
+
+ private Suggestion[] toQuerySensitiveArray(SuggestionList suggestions, int n, String query) {
+ // build an array that is max n in size.
+
+ LinkedList ret = new LinkedList();
+ suggestions.filter(ret, new SuggestionList.SuggestionFilter() {
+
+ public boolean accept(Suggestion suggestion) {
+ return suggestion.getScore() > getSuggestionSupressionThreadshold();
+ }
+ });
+
+
+ if (ret.size() > 1) {
+
+ // if the top scoring suggestion is same as the query
+ if (query.equals(ret.get(0).getSuggested())) {
+
+ // switch top two suggestions
+ Suggestion s = ret.get(0);
+ ret.set(0, ret.get(1));
+ ret.set(1, s);
+
+ // if any other suggestion has many more hits
+ // then set that as top suggestion.
+
+ EditDistance ed = editDistanceFactory(ret.get(0).getSuggested());
+ List topHits = new LinkedList();
+ if (ret.size() > 2) {
+ for (int i = 2; i < ret.size(); i++) {
+ if (ret.get(i).getCorpusQueryResults() > ret.get(0).getCorpusQueryResults()
+ && ed.getNormalizedDistance(ret.get(i).getSuggested()) > 0.8d /* todo setting */) {
+ topHits.add(ret.get(i));
+ }
+ }
+ }
+ if (topHits.size() > 0) {
+
+ if (topHits.size() > 1) {
+ Collections.sort(topHits, new Comparator() {
+ public int compare(Suggestion suggestion, Suggestion suggestion1) {
+ return new Integer(suggestion.getCorpusQueryResults()).compareTo(suggestion1.getCorpusQueryResults());
+ }
+ });
+ }
+ ret.remove(topHits.get(0));
+ ret.addFirst(topHits.get(0));
+ }
+ }
+// else {
+// // order by number of hits
+// Arrays.sort(ret, new Comparator() {
+// public int compare(Suggestion suggestion, Suggestion suggestion1) {
+// return new Integer(suggestion1.getCorpusQueryResults().size()).compareTo(suggestion.getCorpusQueryResults().size());
+// }
+// });
+// }
+ }
+ return ret.toArray(new Suggestion[n < ret.size() ? n : ret.size()]);
+ }
+
+
+ public double getSuggestionSupressionThreadshold
+ () {
+ return suggestionSupressionThreadshold;
+ }
+
+ public void setSuggestionSupressionThreadshold
+ (
+ double suggestionSupressionThreadshold) {
+ this.suggestionSupressionThreadshold = suggestionSupressionThreadshold;
+ }
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/DefaultAprioriCorpusFactory.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/DefaultAprioriCorpusFactory.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/impl/DefaultAprioriCorpusFactory.java (revision 0)
@@ -0,0 +1,65 @@
+package org.apache.lucene.search.didyoumean.impl;
+
+import com.sleepycat.je.DatabaseException;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.facade.IndexFacade;
+import org.apache.lucene.index.facade.IndexWriterFacade;
+import org.apache.lucene.search.didyoumean.AprioriCorpusFactory;
+import org.apache.lucene.search.didyoumean.Suggester;
+import org.apache.lucene.search.didyoumean.dictionary.Dictionary;
+import org.apache.lucene.search.didyoumean.dictionary.SuggestionList;
+
+import java.io.IOException;
+import java.util.*;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * @author karl wettin
+ * Date: 2007-okt-19
+ * Time: 02:13:00
+ */
+public class DefaultAprioriCorpusFactory implements AprioriCorpusFactory {
+
+ public void factory(Dictionary dictionary, Suggester suggester, IndexFacade aprioriIndex, String aprioriIndexField, Analyzer aprioriAnalyzer) throws IOException, DatabaseException {
+ // create an a priori index based on the inverted dictionary
+
+ System.out.println("Inverting index...");
+ Map inverted = dictionary.inverted();
+
+ IndexWriterFacade aprioriWriter = aprioriIndex.indexWriterFactory(aprioriAnalyzer, true);
+
+// int i=0;
+// int i2=0;
+ System.out.println("Extracting most commonly misspelled words and phrases...");
+ for (Map.Entry e : inverted.entrySet()) {
+ if (e.getValue().size() > 1) {
+ String suggested = suggester.didYouMean(dictionary, e.getKey());
+ if (suggested != null && suggested.equalsIgnoreCase(e.getKey())) {
+ Document d = new Document();
+ d.add(new Field(aprioriIndexField, e.getKey(), Field.Store.NO, Field.Index.TOKENIZED, Field.TermVector.WITH_POSITIONS));
+ aprioriWriter.addDocument(d);
+// i2++;
+// System.out.println(i + "\t" + i2);
+ }
+ }
+// i++;
+ }
+ aprioriWriter.close();
+ }
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/SecondLevelTokenPhraseSuggester.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/SecondLevelTokenPhraseSuggester.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/SecondLevelTokenPhraseSuggester.java (revision 0)
@@ -0,0 +1,49 @@
+package org.apache.lucene.search.didyoumean.secondlevel.token;
+
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.index.facade.IndexFacade;
+import org.apache.lucene.search.didyoumean.SecondLevelSuggester;
+import org.apache.lucene.search.didyoumean.SuggestionPriorityQueue;
+
+import java.io.IOException;
+
+/**
+ * Makes TokenPhraseSuggesterImpl a SecondLevelSuggester.
+ *
+ * todo: this is an ugly class. decoration?
+ *
+ * @author Karl Wettin
+ * Date: 2007-feb-17
+ * Time: 08:03:01
+ */
+public class SecondLevelTokenPhraseSuggester extends TokenPhraseSuggesterImpl implements SecondLevelSuggester {
+
+ public SecondLevelTokenPhraseSuggester(TokenSuggester tokenSuggester, String aprioriIndexField, boolean defaultSuggestMorePopularTokensOnly, int defaultMaxSuggestionsPerToken, Analyzer phraseAnalyzer, IndexFacade aprioriIndex) throws IOException {
+ super(tokenSuggester, aprioriIndexField, defaultSuggestMorePopularTokensOnly, defaultMaxSuggestionsPerToken, phraseAnalyzer, aprioriIndex);
+ }
+
+ public SuggestionPriorityQueue suggest(String query) {
+ return suggest(query, 1);
+ }
+
+
+ public boolean hasPersistableSuggestions() {
+ return true;
+ }
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/MultiTokenSuggester.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/MultiTokenSuggester.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/MultiTokenSuggester.java (revision 0)
@@ -0,0 +1,93 @@
+package org.apache.lucene.search.didyoumean.secondlevel.token;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.facade.IndexFacade;
+import org.apache.lucene.search.*;
+import org.apache.lucene.search.didyoumean.SecondLevelSuggester;
+import org.apache.lucene.search.didyoumean.Suggestion;
+import org.apache.lucene.search.didyoumean.SuggestionPriorityQueue;
+
+import java.io.IOException;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * Uses term queries rather than span near queries.
+ *
+ * @author karl wettin
+ * Date: 2007-okt-23
+ * Time: 04:33:29
+ */
+public class MultiTokenSuggester extends TokenPhraseSuggester implements SecondLevelSuggester {
+
+ private IndexFacade aprioriIndex;
+ private IndexReader aprioriReader;
+ private IndexSearcher aprioriSearcher;
+
+
+ protected boolean isUpdateSuggestionsOrder() {
+ return false;
+ }
+
+ public boolean hasPersistableSuggestions() {
+ return false;
+ }
+
+
+ /**
+ *
+ * @param tokenSuggester
+ * @param aprioriIndexField
+ * @param defaultSuggestMorePopularTokensOnly it makes sense setting this to true
+ * @param defaultMaxSuggestionsPerToken
+ * @param queryAnalyzer
+ * @param aprioriIndex
+ * @throws IOException
+ */
+ public MultiTokenSuggester(TokenSuggester tokenSuggester, String aprioriIndexField, boolean defaultSuggestMorePopularTokensOnly, int defaultMaxSuggestionsPerToken, Analyzer queryAnalyzer, IndexFacade aprioriIndex) throws IOException {
+ super(tokenSuggester, aprioriIndexField, defaultSuggestMorePopularTokensOnly, defaultMaxSuggestionsPerToken, queryAnalyzer);
+ this.aprioriIndex = aprioriIndex;
+ this.aprioriReader = aprioriIndex.indexReaderFactory();
+ this.aprioriSearcher = new IndexSearcher(aprioriReader);
+ }
+
+ public SuggestionPriorityQueue suggest(String query) {
+ return suggest(query, 1);
+ }
+
+ protected Query suggestionAprioriQueryFactory(Suggestion[] suggestions) {
+ BooleanQuery bq = new BooleanQuery();
+ for (Suggestion suggestion : suggestions) {
+ bq.add(new TermQuery(new Term(getAprioriIndexField(), suggestion.getSuggested())), BooleanClause.Occur.MUST);
+ }
+ return bq;
+ }
+
+ public IndexFacade getAprioriIndex() {
+ return aprioriIndex;
+ }
+
+ protected IndexReader getAprioriReader() {
+ return aprioriReader;
+ }
+
+ protected IndexSearcher getAprioriSearcher() {
+ return aprioriSearcher;
+ }
+
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/ngram/TermEnumIterator.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/ngram/TermEnumIterator.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/ngram/TermEnumIterator.java (revision 0)
@@ -0,0 +1,85 @@
+package org.apache.lucene.search.didyoumean.secondlevel.token.ngram;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermEnum;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+/**
+ * @author Nicolas Maisonneuve
+ * @author Karl Wettin
+ * Date: 2007-feb-03
+ * Time: 06:40:59
+ */
+public class TermEnumIterator implements Iterator {
+
+ private String field;
+
+ private TermEnum termEnum;
+ private Term actualTerm;
+ private boolean hasNextCalled;
+
+ public TermEnumIterator(IndexReader reader, String field) {
+ this.field = field.intern();
+ try {
+ termEnum = reader.terms(new Term(field, ""));
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+
+ public String next() {
+ if (!hasNextCalled) {
+ hasNext();
+ }
+ hasNextCalled = false;
+ return (actualTerm != null) ? actualTerm.text() : null;
+ }
+
+ public boolean hasNext() {
+ if (hasNextCalled) {
+ return actualTerm != null;
+ }
+ hasNextCalled = true;
+ try {
+ // if there are no more words
+ if (!termEnum.next()) {
+ actualTerm = null;
+ return false;
+ }
+ // if the next word is in the field
+ actualTerm = termEnum.term();
+ String currentField = actualTerm.field();
+ if (currentField != field) {
+ actualTerm = null;
+ return false;
+ }
+ return true;
+ } catch (IOException e) {
+ e.printStackTrace();
+ return false;
+ }
+ }
+
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/ngram/NgramTokenSuggester.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/ngram/NgramTokenSuggester.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/ngram/NgramTokenSuggester.java (revision 0)
@@ -0,0 +1,372 @@
+package org.apache.lucene.search.didyoumean.secondlevel.token.ngram;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.facade.IndexFacade;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.facade.IndexWriterFacade;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.*;
+import org.apache.lucene.search.didyoumean.EditDistance;
+import org.apache.lucene.search.didyoumean.Levenshtein;
+import org.apache.lucene.search.didyoumean.Suggestion;
+import org.apache.lucene.search.didyoumean.SuggestionPriorityQueue;
+import org.apache.lucene.search.didyoumean.secondlevel.token.TokenSuggester;
+import org.apache.lucene.search.didyoumean.secondlevel.token.TokenSuggestion;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.Set;
+
+/**
+ * A single token word suggester based on n-grams.
+ *
+ * Uses Lucene for persistency and token n-gram matching.
+ *
+ * @author initially inspired by the David Spencer code.
+ * @author Nicolas Maisonneuve
+ * @author Karl Wettin
+ * Date: 2007-feb-03
+ * Time: 05:29:56
+ */
+public class NgramTokenSuggester implements TokenSuggester {
+
+ private IndexFacade ngramIndex;
+ private IndexReader ngramReader;
+ private IndexSearcher ngramSearcher;
+
+ public NgramTokenSuggester(IndexFacade ngramIndex) throws IOException {
+ this.ngramIndex = ngramIndex;
+ ngramReader = ngramIndex.indexReaderFactory();
+ ngramSearcher = new IndexSearcher(ngramReader);
+ }
+
+
+ public IndexReader getNgramReader() {
+ return ngramReader;
+ }
+
+ public IndexSearcher getNgramSearcher() {
+ return ngramSearcher;
+ }
+
+ public IndexFacade getNgramIndex() {
+ return ngramIndex;
+ }
+
+ private int defaultHitEnumerationsPerSuggestion = 10;
+
+ /**
+ * Field name for each word in the ngram index.
+ */
+ public static final String F_WORD = "word";
+
+ /**
+ * Boost value for start and end grams
+ */
+ private float bStart = 2.0f;
+ private float bEnd = 1.0f;
+
+ // minimum score for hits generated by the spell checker query
+ private float minScore = 0.5f;
+
+ /**
+ * Sets the accuracy 0 < minScore < 1; default 0.5
+ */
+ public void setAccuracy(float min) {
+ this.minScore = min;
+ }
+
+ private boolean defaultSuggestSelf = false;
+ private boolean defaultSuggestOnlyMorePopularTokens = false;
+
+ public SuggestionPriorityQueue suggest(String queryToken, int n) throws IOException {
+ return suggest(queryToken, n, defaultSuggestSelf, null, null, false, getDefaultHitEnumerationsPerSuggestion());
+ }
+
+ public SuggestionPriorityQueue suggest(String queryToken, int n, boolean suggestSelf, IndexReader aprioriIndexReader, String aprioriIndexField, boolean selectMorePopularTokensOnly) throws IOException {
+ return suggest(queryToken, n, suggestSelf, aprioriIndexReader, aprioriIndexField, selectMorePopularTokensOnly, getDefaultHitEnumerationsPerSuggestion());
+ }
+
+ /**
+ * Suggest similar words (restricted or not to a field of a user index)
+ *
+ * @param queryToken the word you want a spell check done on
+ * @param maxSuggestions the number of suggest words
+ * @param suggestSelf if true, a suggestion can be the queried token.
+ * @param aprioriIndexReader
+ * @param aprioriIndexField the field of the user index: if field is not null, the suggested
+ * words are restricted to the words present in this field.
+ * @param suggestOnyMorePopularTokens if true, suggest only tokens that are more frequent than the query token
+ * (only if restricted mode = (aprioriIndex!=null and aprioriIndexField!=null)
+ * @param hitEnumerationsPerSuggestion number of ngram document to measure edit distance on for each number of expected returned suggestions. @return suggestions the query token
+ * @throws IOException if something went wrong in either aprioriIndex or ngramIndex.
+ */
+ public SuggestionPriorityQueue suggest(String queryToken, int maxSuggestions, boolean suggestSelf, IndexReader aprioriIndexReader,
+ String aprioriIndexField, boolean suggestOnyMorePopularTokens, int hitEnumerationsPerSuggestion) throws IOException {
+
+ SuggestionPriorityQueue queue = new SuggestionPriorityQueue(maxSuggestions);
+
+ float minScore = this.minScore;
+ final EditDistance editDistance = editDistanceFactory(queryToken);
+ final int tokenLength = queryToken.length();
+
+ final int goalFreq = (suggestOnyMorePopularTokens && aprioriIndexReader != null) ? aprioriIndexReader.docFreq(new Term(aprioriIndexField, queryToken)) : 0;
+ // if the word exists in the real index and we don't care for word frequency, return the word itself
+ if (!suggestOnyMorePopularTokens && goalFreq > 0) {
+ queue.insert(new Suggestion(queryToken));
+ return queue;
+ }
+
+ BooleanQuery query = new BooleanQuery();
+ String[] grams;
+ String key;
+
+ for (int ng = getMin(tokenLength); ng <= getMax(tokenLength); ng++) {
+
+ key = "gram" + ng; // form key
+
+ grams = formGrams(queryToken, ng); // form word into ngrams (allow dups too)
+
+ if (grams.length == 0) {
+ continue; // hmm
+ }
+
+ if (bStart > 0) { // should we boost prefixes?
+ add(query, "start" + ng, grams[0], bStart); // matches start of word
+
+ }
+ if (bEnd > 0) { // should we boost suffixes
+ add(query, "end" + ng, grams[grams.length - 1], bEnd); // matches end of word
+
+ }
+ for (String gram : grams) {
+ add(query, key, gram);
+ }
+ }
+
+// System.out.println("Q: " + query);
+ Hits hits = ngramSearcher.search(query);
+// System.out.println("HITS: " + hits.length());
+
+ // go thru more than 'maxr' matches in case the distance filter triggers
+ TokenSuggestion suggestion = new TokenSuggestion();
+ int stop = maxSuggestions * hitEnumerationsPerSuggestion;
+ for (int currentHit = 0; currentHit < hits.length() && currentHit < stop; currentHit++) {
+ suggestion.setSuggested(hits.doc(currentHit).get(F_WORD)); // get orig word
+
+ // don't suggest a word for itself, that would be silly
+ if (!suggestSelf && suggestion.getSuggested().equals(queryToken)) {
+ continue;
+ }
+
+ // edit distance/normalize with the minScore word length
+ suggestion.setScore(1.0d - ((double) editDistance.getDistance(suggestion.getSuggested()) / Math
+ .min(suggestion.getSuggested().length(), tokenLength)));
+ if (suggestion.getScore() < minScore) {
+ continue;
+ }
+
+ if (aprioriIndexReader != null) { // use the user index
+ suggestion.setFrequency(aprioriIndexReader.docFreq(new Term(aprioriIndexField, suggestion.getSuggested()))); // freq in the index
+ // don't suggest a word that is not present in the field
+ if ((suggestOnyMorePopularTokens && goalFreq > suggestion.getFrequency()) || suggestion.getFrequency() < 1) {
+ continue;
+ }
+ }
+ queue.insert(suggestion);
+
+
+ suggestion = new TokenSuggestion();
+ }
+
+ return queue;
+ }
+
+ /**
+ * Add a clause to a boolean query.
+ */
+ private void add(BooleanQuery q, String name, String value, float boost) {
+ Query tq = new TermQuery(new Term(name, value));
+ tq.setBoost(boost);
+ q.add(new BooleanClause(tq, BooleanClause.Occur.SHOULD));
+ }
+
+ /**
+ * Add a clause to a boolean query.
+ */
+ private void add(BooleanQuery q, String name, String value) {
+ q.add(new BooleanClause(new TermQuery(new Term(name, value)), BooleanClause.Occur.SHOULD));
+ }
+
+ /**
+ * Form all ngrams for a given word.
+ *
+ * @param text the word to parse
+ * @param ng the ngram length e.g. 3
+ * @return an array of all ngrams in the word and note that duplicates are not removed
+ */
+ private String[] formGrams(String text, int ng) {
+ int len = text.length();
+ String[] res = new String[len - ng + 1];
+ for (int i = 0; i < len - ng + 1; i++) {
+ res[i] = text.substring(i, i + ng);
+ }
+ return res;
+ }
+
+ /**
+ * Index a Dictionary
+ *
+ * @param tokens the dictionary to index
+ * @throws IOException
+ */
+ public void indexDictionary(Iterator tokens) throws IOException {
+ indexDictionary(tokens, 3);
+ }
+
+ /**
+ * Index a Dictionary
+ *
+ * @param tokens the dictionary to index
+ * @param minTokenLength minimum size of token to be suggestable. 2 if you want "on" to suggest "in".
+ * @throws IOException
+ */
+ public void indexDictionary(Iterator tokens, int minTokenLength) throws IOException {
+ if (minTokenLength < 2) {
+ minTokenLength = 2;
+ }
+ IndexWriterFacade writer = ngramIndex.indexWriterFactory(new StandardAnalyzer(Collections.EMPTY_SET), false);
+ //writer.setMergeFactor(300);
+ //writer.setMaxBufferedDocs(150);
+
+ Set unflushedTokens = new HashSet(1000);
+
+ while (tokens.hasNext()) {
+ String token = tokens.next();
+
+ int len = token.length();
+ if (len < minTokenLength) {
+ continue; // too short we bail but "too long" is fine...
+ }
+
+ if (unflushedTokens.contains(token) || ngramReader.docFreq(new Term(F_WORD, token)) > 0) {
+ continue;
+ }
+
+ // ok index the word
+ Document doc = createDocument(token, getMin(len), getMax(len));
+ writer.addDocument(doc);
+ unflushedTokens.add(token);
+
+ }
+
+ writer.optimize();
+ writer.close();
+
+ IndexSearcher oldSearcher = ngramSearcher;
+ IndexReader oldReader = ngramReader;
+
+ ngramReader = ngramIndex.indexReaderFactory();
+ ngramSearcher = new IndexSearcher(ngramReader);
+
+ oldSearcher.close();
+ oldReader.close();
+
+ }
+
+ private int getMin(int l) {
+ if (l > 5) {
+ return 3;
+ }
+ if (l == 5) {
+ return 2;
+ }
+ return 1;
+ }
+
+ private int getMax(int l) {
+ if (l > 5) {
+ return 4;
+ }
+ if (l == 5) {
+ return 3;
+ }
+ return 2;
+ }
+
+ private Document createDocument(String text, int ng1, int ng2) {
+ Document doc = new Document();
+ doc.add(new Field(F_WORD, text, Field.Store.YES, Field.Index.UN_TOKENIZED)); // orig term
+ addGram(text, doc, ng1, ng2);
+ return doc;
+ }
+
+ private void addGram(String text, Document doc, int ng1, int ng2) {
+ int len = text.length();
+ for (int ng = ng1; ng <= ng2; ng++) {
+ String key = "gram" + ng;
+ String end = null;
+ for (int i = 0; i < len - ng + 1; i++) {
+ String gram = text.substring(i, i + ng);
+ doc.add(new Field(key, gram, Field.Store.YES, Field.Index.UN_TOKENIZED));
+ if (i == 0) {
+ doc.add(new Field("start" + ng, gram, Field.Store.YES, Field.Index.UN_TOKENIZED));
+ }
+ end = gram;
+ }
+ if (end != null) { // may not be present if len==ng1
+ doc.add(new Field("end" + ng, end, Field.Store.YES, Field.Index.UN_TOKENIZED));
+ }
+ }
+ }
+
+
+ public boolean isDefaultSuggestSelf() {
+ return defaultSuggestSelf;
+ }
+
+ public void setDefaultSuggestSelf(boolean defaultSuggestSelf) {
+ this.defaultSuggestSelf = defaultSuggestSelf;
+ }
+
+ public boolean isDefaultSuggestOnlyMorePopularTokens() {
+ return defaultSuggestOnlyMorePopularTokens;
+ }
+
+ public void setDefaultSuggestOnlyMorePopularTokens(boolean defaultSuggestOnlyMorePopularTokens) {
+ this.defaultSuggestOnlyMorePopularTokens = defaultSuggestOnlyMorePopularTokens;
+ }
+
+
+ public int getDefaultHitEnumerationsPerSuggestion() {
+ return defaultHitEnumerationsPerSuggestion;
+ }
+
+ public void setDefaultHitEnumerationsPerSuggestion(int defaultHitEnumerationsPerSuggestion) {
+ this.defaultHitEnumerationsPerSuggestion = defaultHitEnumerationsPerSuggestion;
+ }
+
+
+ public EditDistance editDistanceFactory(String sa) {
+ return new Levenshtein(sa);
+ }
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/TokenSuggestion.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/TokenSuggestion.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/TokenSuggestion.java (revision 0)
@@ -0,0 +1,77 @@
+package org.apache.lucene.search.didyoumean.secondlevel.token;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+import org.apache.lucene.search.didyoumean.Suggestion;
+
+import java.io.Serializable;
+
+/**
+ * @author Karl Wettin
+ * Date: 2007-feb-03
+ * Time: 05:43:03
+ */
+public class TokenSuggestion extends Suggestion implements Serializable {
+
+ // private static Log log = LogFactory.getLog(SingleTokenSuggestion.class);
+ private static long serialVersionUID = 1l;
+
+ private int frequency;
+
+
+ public TokenSuggestion() {
+ }
+
+ public TokenSuggestion(String suggested, double score, int frequency) {
+ super(suggested, score);
+ this.frequency = frequency;
+ }
+
+
+ public int getFrequency() {
+ return frequency;
+ }
+
+ public void setFrequency(int frequency) {
+ this.frequency = frequency;
+ }
+
+
+ public int compareTo(Suggestion suggestion) {
+ if (suggestion instanceof TokenSuggestion) {
+ TokenSuggestion tokenSuggestion = (TokenSuggestion) suggestion;
+ // first criteria: the edit distance
+ if (getScore() > tokenSuggestion.getScore()) {
+ return 1;
+ }
+ if (getScore() < tokenSuggestion.getScore()) {
+ return -1;
+ }
+
+ // second criteria (if first criteria is equal): the popularity
+ if (getFrequency() > tokenSuggestion.getFrequency()) {
+ return 1;
+ }
+
+ if (getFrequency() < tokenSuggestion.getFrequency()) {
+ return -1;
+ }
+ return 0;
+ } else {
+ return super.compareTo(suggestion);
+ }
+ }
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/TokenSuggester.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/TokenSuggester.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/TokenSuggester.java (revision 0)
@@ -0,0 +1,33 @@
+package org.apache.lucene.search.didyoumean.secondlevel.token;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.didyoumean.SuggestionPriorityQueue;
+
+import java.io.IOException;
+
+
+/**
+ * A suggester that can only handle single token words.
+ *
+ * @author Karl Wettin
+ * Date: 2007-jan-30
+ * Time: 06:05:54
+ */
+public interface TokenSuggester {
+ public abstract SuggestionPriorityQueue suggest(String queryToken, int n, boolean suggestSelf, IndexReader aprioriIndexReader, String aprioriIndexField, boolean selectMorePopularTokensOnly) throws IOException;
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/TokenPhraseSuggester.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/TokenPhraseSuggester.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/TokenPhraseSuggester.java (revision 0)
@@ -0,0 +1,448 @@
+package org.apache.lucene.search.didyoumean.secondlevel.token;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.TermFreqVector;
+import org.apache.lucene.index.TermPositionVector;
+import org.apache.lucene.index.facade.IndexFacade;
+import org.apache.lucene.search.Hits;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.didyoumean.Suggestion;
+import org.apache.lucene.search.didyoumean.SuggestionPriorityQueue;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.*;
+
+/**
+ * A layer on top of the single token suggesting {@link TokenSuggester} that enables muti token (phrase) suggestions.
+ *
+ * Pretty much the same thing as a SpanFuzzyQuery.
+ *
+ * Places a matrix of {@link org.apache.lucene.search.spans.SpanNearQuery} to find valid suggestions. If any of the
+ * valid hits contains a {@link org.apache.lucene.index.TermPositionVector}, it will be analyzed and suggest the query
+ * in the order of terms in the index.
+ * todo: if term positions available and stored, suggest that for cosmetic reasons in case of stemming et c.)
+ * todo: E.g. query "camel that broke the staw" is suggested with "straw that broke the camel"
+ *
+ *
+ * @author Karl Wettin
+ * Date: 2007-jan-30
+ * Time: 05:24:38
+ */
+public abstract class TokenPhraseSuggester {
+
+ private TokenSuggester tokenSuggester;
+
+ /**
+ * this is a nasty hack due to MultiTokenSuggester using this code.
+ * @return true if this implementation allows inspection of term vector to change order of query
+ */
+ protected abstract boolean isUpdateSuggestionsOrder();
+
+ /**
+ * @return the index used for checking that suggestions match something, inspecting term position vector- and term frequency inspection.
+ * @see TokenPhraseSuggester#getAprioriIndexField()
+ */
+ protected abstract IndexFacade getAprioriIndex();
+
+ protected abstract IndexReader getAprioriReader();
+
+ protected abstract IndexSearcher getAprioriSearcher();
+
+ /**
+ * @param suggestions suggestion to be made in to a query
+ * @return a query that much match in order for the suggestion to be suggested
+ */
+ protected abstract Query suggestionAprioriQueryFactory(Suggestion[] suggestions);
+
+ /**
+ * if true, the ngram suggester will only suggest words more frequent than the query word
+ */
+ private boolean defaultSuggestMorePopularTokensOnly;
+
+ /**
+ * used by the single token ngram spell checker when suggestMorePopularTokensOnly is true
+ * in the apriori index reader to check frequency.
+ */
+ private String aprioriIndexField;
+
+
+ /**
+ * number of suggestion per token in phrase.
+ * there will be (n^tokens in phrase) queries placed on the index to find the best suggestion.
+ * e.g. "three token phrase" and n=5 might results in 243 queries on the apriori index.
+ */
+ private int defaultMaxSuggestionsPerToken = 3;
+ private Analyzer queryAnalyzer;
+
+ /**
+ * @param tokenSuggester the single token suggester that backs this phrase suggester
+ * @param aprioriIndexField the document field used for term frequency inspection at token suggestion level.
+ * @param defaultSuggestMorePopularTokensOnly
+ * if true, at token suggestion level, only suggest tokens that are more popular than the one in the original query.
+ * @param defaultMaxSuggestionsPerToken number of suggestion per token in phrase. there will be (n^tokens in phrase) queries placed on the index to find the best suggestion. e.g. "three token phrase" and n=5 might results in 243 queries on the apriori index.
+ * @param queryAnalyzer the analyzer used to tokenize phrases
+ * @throws java.io.IOException if something goes wrong in either the ngram spell checker or in the apriori index
+ */
+ public TokenPhraseSuggester(TokenSuggester tokenSuggester, String aprioriIndexField, boolean defaultSuggestMorePopularTokensOnly, int defaultMaxSuggestionsPerToken, Analyzer queryAnalyzer) throws IOException {
+ this.tokenSuggester = tokenSuggester;
+ this.aprioriIndexField = aprioriIndexField;
+ this.defaultSuggestMorePopularTokensOnly = defaultSuggestMorePopularTokensOnly;
+ this.defaultMaxSuggestionsPerToken = defaultMaxSuggestionsPerToken;
+ this.queryAnalyzer = queryAnalyzer;
+ }
+
+ public String didYouMean(String query) {
+ SuggestionPriorityQueue suggestions = suggest(query, 1);
+ if (suggestions.size() > 0) {
+ return ((Suggestion) suggestions.top()).getSuggested();
+ } else {
+ return null;
+ }
+ }
+
+ /**
+ * If this returns true,
+ * then didYouMean() will place this phrase as a query against the apriori index
+ * to see if it is a good suggestion or not.
+ *
+ * @param phrase the suggested phrase
+ * @param query the user input that requests the suggestion
+ * @return true if the phrase is known to the sub class
+ */
+ protected boolean isSuggestablePhrase(String phrase, String query) {
+ return true;
+ }
+
+
+ public boolean suggestionArraysEquals(Suggestion[] suggestions, Suggestion[] suggestions1) {
+ if (suggestions == suggestions1) {
+ return true;
+ }
+ if (suggestions == null /*&& suggestions1 != null*/) {
+ return false;
+ }
+ if (/*suggestions != null &&*/ suggestions1 == null) {
+ return false;
+ }
+ if (suggestions.length != suggestions1.length) {
+ return false;
+ }
+ for (int i = 0; i < suggestions.length; i++) {
+ if (!suggestions[i].getSuggested().equals(suggestions1[i].getSuggested())) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+
+ /**
+ * @param query user input
+ * @param maxSuggestions will not return more than this many suggestions
+ * @return suggestions found, in natural order
+ */
+ public SuggestionPriorityQueue suggest(String query, int maxSuggestions) {
+ return suggest(query, maxSuggestions, getDefaultMaxSuggestionsPerToken(), isDefaultSuggestMorePopularTokensOnly());
+ }
+
+ /**
+ * @param query user input
+ * @param maxSuggestions limits results to this many suggestions
+ * @param maxSuggestionsPerToken tokens in the phrase will contain max this many suggestions
+ * @param suggestMorePopularTokensOnly true if the token suggester should suggest only more popular terms. not recommended.
+ * @return suggestions found, in natural order
+ */
+ public SuggestionPriorityQueue suggest(String query, int maxSuggestions, int maxSuggestionsPerToken, boolean suggestMorePopularTokensOnly) {
+
+ long ms = System.currentTimeMillis();
+
+ // build a matrix with all suggestions per token in query.
+
+ final List matrix = new LinkedList();
+
+ TokenStream ts = getQueryAnalyzer().tokenStream(null, new StringReader(query));
+ try {
+ Token token;
+ while ((token = ts.next()) != null) {
+ try {
+ SuggestionPriorityQueue suggestions = tokenSuggester.suggest(token.termText(), maxSuggestionsPerToken, true, getAprioriReader(), getAprioriIndexField(), suggestMorePopularTokensOnly);
+ if (suggestions.size() == 0) {
+ suggestions.insert(new Suggestion(token.termText()));
+ }
+ matrix.add(suggestions.toArray());
+
+ } catch (IOException ioe) {
+ throw new RuntimeException("Exception caught while looking for a suggestion to " + query, ioe);
+ }
+ }
+ } catch (IOException ioe) {
+ throw new RuntimeException("Error tokenizing " + query, ioe);
+ }
+
+ /*
+ * iterate through all possible combinations of suggetions in the matrix
+ *
+ * the best game
+ * tho rest fame
+ * lame
+ *
+ * tho best game
+ * tho best fame
+ * tho best lame
+ * tho rest game
+ * tho rest fame
+ * tho rest lame
+ * the best game
+ * the best fame
+ * the best lame
+ * the rest game
+ * the rest fame
+ * the rest lame
+ *
+ */
+ Iterator itAllCombinations = new Iterator() {
+ private int[] counter = new int[matrix.size()];
+
+
+ public void remove() {
+ throw new IllegalStateException("not implemented");
+ }
+
+ public boolean hasNext() {
+ int s = counter.length;
+ return counter[s - 1] < matrix.get(s - 1).length;
+ }
+
+ public Suggestion[] next() {
+ if (!hasNext()) {
+ throw new NoSuchElementException("no more elements");
+ }
+ Suggestion[] ls = new Suggestion[counter.length];
+ for (int i = 0; i < counter.length; i++) {
+ ls[i] = (matrix.get(i)[counter[i]]);
+ }
+ incrementCounter();
+ return ls;
+ }
+
+ private void incrementCounter() {
+ for (int i = 0; i < counter.length; i++) {
+ counter[i]++;
+ if (counter[i] == matrix.get(i).length &&
+ i < counter.length - 1) {
+ counter[i] = 0;
+ } else {
+ break;
+ }
+ }
+ }
+ };
+
+ SuggestionPriorityQueue queue = new SuggestionPriorityQueue(maxSuggestionsPerToken);
+
+
+ int queryCounter = 0;
+ int currentHit = 0;
+ Hits hits = null;
+ Suggestion[] suggestions = null;
+ while (hits != null || itAllCombinations.hasNext()) {
+ Integer corpusQueryResults = null;
+
+ if (hits == null) {
+ suggestions = itAllCombinations.next();
+ queryCounter++;
+
+ StringBuilder sb = new StringBuilder(10 * matrix.size());
+ for (Suggestion suggestion : suggestions) {
+ sb.append(suggestion.getSuggested());
+ sb.append(" ");
+ }
+ sb.deleteCharAt(sb.length() - 1);
+ String phrase = sb.toString();
+
+ //System.out.println(cnt + "\t" + phrase);
+
+ if (isSuggestablePhrase(phrase, query)) {
+ Query nextQuery = suggestionAprioriQueryFactory(suggestions);
+ try {
+
+ hits = getAprioriSearcher().search(nextQuery);
+ corpusQueryResults = hits.length();
+ if (hits.length() == 0) {
+ hits = null;
+ continue;
+ } else {
+ currentHit = 0;
+ }
+ } catch (IOException ioe) {
+ throw new RuntimeException("Exception caught while searching for " + nextQuery.toString(), ioe);
+ }
+ } else {
+ continue;
+ }
+ } else {
+ currentHit++;
+ }
+
+
+ if (isUpdateSuggestionsOrder()) {
+
+ // todo refactor, this is a nasty hack due to MultiTokenSuggester using this code.
+
+
+ if (currentHit < hits.length()) {
+
+ // attempt to figure out the order of the tokens in the phrase
+
+ try {
+ TermFreqVector termFreqVector = getAprioriReader().getTermFreqVector(hits.id(currentHit), getAprioriIndexField());
+ if (termFreqVector != null && termFreqVector instanceof TermPositionVector) {
+ TermPositionVector termPosVector = (TermPositionVector) termFreqVector;
+ final int[][] suggestionPositions = new int[suggestions.length][];
+
+ for (int i = 0; i < suggestionPositions.length; i++) {
+ int termIndex = Arrays.binarySearch(termFreqVector.getTerms(), suggestions[i].getSuggested());
+ suggestionPositions[i] = termPosVector.getTermPositions(termIndex);
+ }
+
+ // todo: if pos vectors and stored data, then extract the suggestion as stored.
+// TermVectorOffsetInfo[] offsets = termPosVector.getOffsets(termIndex);
+// if (offsets != null) {
+// suggestionOffsets[i][0] = i;
+// suggestionOffsets[i][1] = i;
+// }
+
+ // todo: find the actual position area that matched, rather than just looking at the first occurances.
+ // todo: at least optimze!
+ Map positions = new HashMap(suggestions.length);
+ for (int i = 0; i < suggestions.length; i++) {
+ positions.put(suggestions[i], suggestionPositions[i][0]);
+ }
+ Map.Entry[] ordered = positions.entrySet().toArray((Map.Entry[]) new Map.Entry[0]);
+ Arrays.sort(ordered, new Comparator>() {
+
+ public int compare(Map.Entry entry, Map.Entry entry1) {
+ return entry.getValue().compareTo(entry1.getValue());
+ }
+ });
+ for (int i = 0; i < ordered.length; i++) {
+ suggestions[i] = ordered[i].getKey();
+ }
+
+ // token order found
+
+ } else {
+ // to term vector. look for one in next hit.
+ // todo some setting that skip this when score is lower than n or current hit greather than n.
+ if (hits.length() - 1 == currentHit) {
+ // fall back on user input
+ } else {
+ continue;
+ }
+ }
+ } catch (IOException ioe) {
+ //throw new RuntimeException("Exception caught when inspecting the term position vectors.", ioe);
+ }
+ }
+ }
+
+ StringBuilder stringBuilder = new StringBuilder(matrix.size() * 10);
+ for (Suggestion suggestion : suggestions) {
+ stringBuilder.append(suggestion.getSuggested());
+ stringBuilder.append(' ');
+ }
+ stringBuilder.deleteCharAt(stringBuilder.length() - 1);
+
+ String suggested = stringBuilder.toString();
+
+ double score = 0;
+ for (Suggestion suggestion : suggestions) {
+ score += suggestion.getScore();
+ }
+ score *= hits.length();
+
+ queue.insert(new Suggestion(suggested, score, corpusQueryResults));
+
+ hits = null;
+
+ }
+
+ System.out.println("Built matrix with " + queryCounter + " queries and executed them in " + (System.currentTimeMillis() - ms) + " milliseconds.");
+
+ return queue;
+ }
+
+
+ public TokenSuggester getTokenSuggester() {
+ return tokenSuggester;
+ }
+
+ public void setTokenSuggester(TokenSuggester tokenSuggester) {
+ this.tokenSuggester = tokenSuggester;
+ }
+
+ /**
+ * @return the document field used for term frequency inspection at token suggestion level.
+ */
+ public String getAprioriIndexField() {
+ return aprioriIndexField;
+ }
+
+ /**
+ * @param aprioriIndexField the document field used for term frequency inspection at token suggestion level.
+ */
+ public void setAprioriIndexField(String aprioriIndexField) {
+ this.aprioriIndexField = aprioriIndexField;
+ }
+
+ /**
+ * // todo: consider if false will allow for "lost on translation" to suggest "lost in translation".
+ * // todo: will single token phrases then start suggesting a lot of bad things?
+ *
+ * @return if true, at token suggestion level, only suggest tokens that are more popular than the one in the original query.
+ */
+ public boolean isDefaultSuggestMorePopularTokensOnly() {
+ return defaultSuggestMorePopularTokensOnly;
+ }
+
+ public void setDefaultSuggestMorePopularTokensOnly(boolean defaultSuggestMorePopularTokensOnly) {
+ this.defaultSuggestMorePopularTokensOnly = defaultSuggestMorePopularTokensOnly;
+ }
+
+
+ public int getDefaultMaxSuggestionsPerToken() {
+ return defaultMaxSuggestionsPerToken;
+ }
+
+ public void setDefaultMaxSuggestionsPerToken(int defaultMaxSuggestionsPerToken) {
+ this.defaultMaxSuggestionsPerToken = defaultMaxSuggestionsPerToken;
+ }
+
+ public Analyzer getQueryAnalyzer() {
+ return queryAnalyzer;
+ }
+
+ public void setQueryAnalyzer(Analyzer queryAnalyzer) {
+ this.queryAnalyzer = queryAnalyzer;
+ }
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/TokenPhraseSuggesterImpl.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/TokenPhraseSuggesterImpl.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/TokenPhraseSuggesterImpl.java (revision 0)
@@ -0,0 +1,71 @@
+package org.apache.lucene.search.didyoumean.secondlevel.token;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.index.facade.IndexFacade;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.didyoumean.Suggestion;
+import org.apache.lucene.search.spans.SpanNearQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+
+import java.io.IOException;
+
+/**
+ * @author Karl Wettin
+ * Date: 2007-feb-03
+ * Time: 08:11:34
+ */
+public class TokenPhraseSuggesterImpl extends TokenPhraseSuggester {
+
+ private IndexFacade aprioriIndex;
+ private IndexReader aprioriReader;
+ private IndexSearcher aprioriSearcher;
+
+
+ protected boolean isUpdateSuggestionsOrder() {
+ return true;
+ }
+
+ public TokenPhraseSuggesterImpl(TokenSuggester tokenSuggester, String aprioriIndexField, boolean defaultSuggestMorePopularTokensOnly, int defaultMaxSuggestionsPerToken, Analyzer phraseAnalyzer, IndexFacade aprioriIndex) throws IOException {
+ super(tokenSuggester, aprioriIndexField, defaultSuggestMorePopularTokensOnly, defaultMaxSuggestionsPerToken, phraseAnalyzer);
+ this.aprioriIndex = aprioriIndex;
+ this.aprioriReader = aprioriIndex.indexReaderFactory();
+ this.aprioriSearcher = new IndexSearcher(aprioriReader);
+ }
+
+ protected Query suggestionAprioriQueryFactory(Suggestion[] suggestions) {
+ SpanTermQuery[] clauses = new SpanTermQuery[suggestions.length];
+ for (int i = 0; i < suggestions.length; i++) {
+ clauses[i] = new SpanTermQuery(new Term(getAprioriIndexField(), suggestions[i].getSuggested()));
+ }
+ return new SpanNearQuery(clauses, 5, false);
+ }
+
+ public IndexFacade getAprioriIndex() {
+ return aprioriIndex;
+ }
+
+ protected IndexReader getAprioriReader() {
+ return aprioriReader;
+ }
+
+ protected IndexSearcher getAprioriSearcher() {
+ return aprioriSearcher;
+ }
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/package.html
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/package.html (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/secondlevel/token/package.html (revision 0)
@@ -0,0 +1,13 @@
+
+
+
+
+
+
+
+
+This package needs refactoring. MultiTokenSuggester and TokenPhraseSuggester should extend the same abstract class.
+
+
+
+
\ No newline at end of file
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/QueryGoalTreeExtractor.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/QueryGoalTreeExtractor.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/QueryGoalTreeExtractor.java (revision 0)
@@ -0,0 +1,40 @@
+package org.apache.lucene.search.didyoumean;
+
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+import java.util.List;
+
+/**
+ * A user session could contain multiple quests for content. For example,
+ * first the user looks for the Apache licence,
+ * spells it wrong, inspects different results,
+ * and then the user search for the author Ivan Goncharov.
+ *
+ * In the session analyzing suggester, this is called different goals.
+ *
+ * It is up to the QueryGoalTreeExtractor implementations to decide what
+ * events in a session are parts of the same goal, as we don't want to
+ * suggest the user to check out Goncharov when looking for the Apache license.
+ *
+ * @author Karl Wettin
+ * Date: Aug 1, 2006
+ * Time: 2:33:47 PM
+ */
+public interface QueryGoalTreeExtractor {
+
+ public abstract List> extractGoalRoots(QueryGoalNode session);
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/Levenshtein.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/Levenshtein.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/Levenshtein.java (revision 0)
@@ -0,0 +1,125 @@
+package org.apache.lucene.search.didyoumean;
+
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * Levenshtein edit distance.
+ * http://en.wikipedia.org/wiki/Levenshtein_distance
+ *
+ * todo: this is currenly just a copy of the TRStringDistance,
+ * todo: the thought is to make this in to an interface that TRStringDistance implements
+ * todo: but that requires a bit of refactoring.
+ */
+public final class Levenshtein extends EditDistance {
+
+ private final int n;
+ private final int[][][] cache = new int[30][][];
+
+
+ /**
+ * Optimized to run a bit faster than the static getDistance().
+ * In one benchmark times were 5.3sec using ctr vs 8.5sec w/ static method, thus 37% faster.
+ */
+ public Levenshtein(String sa) {
+ super(sa);
+ n = sa.length();
+ }
+
+
+ public final int getDistance(String query) {
+ int d[][]; // matrix
+ int cost; // cost
+
+ // Step 1
+ final char[] ta = query.toCharArray();
+ final int m = ta.length;
+ if (n == 0) {
+ return m;
+ }
+ if (m == 0) {
+ return n;
+ }
+
+ if (m >= cache.length) {
+ d = form(n, m);
+ } else if (cache[m] != null) {
+ d = cache[m];
+ } else {
+ d = cache[m] = form(n, m);
+
+ // Step 3
+
+ }
+ for (int i = 1; i <= n; i++) {
+ final char s_i = sa[i - 1];
+
+ // Step 4
+
+ for (int j = 1; j <= m; j++) {
+ final char t_j = ta[j - 1];
+
+ // Step 5
+
+ if (s_i == t_j) { // same
+ cost = 0;
+ } else { // not a match
+ cost = 1;
+
+ // Step 6
+
+ }
+ d[i][j] = min3(d[i - 1][j] + 1, d[i][j - 1] + 1, d[i - 1][j - 1] + cost);
+ }
+ }
+
+ // Step 7
+ return d[n][m];
+
+ }
+
+
+ private static int[][] form(int n, int m) {
+ int[][] d = new int[n + 1][m + 1];
+ // Step 2
+
+ for (int i = 0; i <= n; i++) {
+ d[i][0] = i;
+
+ }
+ for (int j = 0; j <= m; j++) {
+ d[0][j] = j;
+ }
+ return d;
+ }
+
+
+ //****************************
+ // Get minimum of three values
+ //****************************
+ private static int min3(int a, int b, int c) {
+ int mi = a;
+ if (b < mi) {
+ mi = b;
+ }
+ if (c < mi) {
+ mi = c;
+ }
+ return mi;
+
+ }
+
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/dictionary/SuggestionList.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/dictionary/SuggestionList.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/dictionary/SuggestionList.java (revision 0)
@@ -0,0 +1,131 @@
+package org.apache.lucene.search.didyoumean.dictionary;
+
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+import com.sleepycat.persist.model.Entity;
+import com.sleepycat.persist.model.PrimaryKey;
+
+import java.io.Serializable;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+
+import org.apache.lucene.search.didyoumean.Suggestion;
+
+/**
+ * A list of suggetions to a miss spelled word, ordered by suggestion score.
+ *
+ * @author Karl Wettin
+ * Date: 2007-feb-02
+ * Time: 06:28:23
+ *
+ */
+@Entity
+public class SuggestionList implements Iterable, Serializable {
+
+ // private static Log log = LogFactory.getLog(SuggestionsList.class);
+ private static long serialVersionUID = 1l;
+
+ @PrimaryKey
+ private String query;
+ private List suggestions = new LinkedList();
+
+ /** bdb persistency */
+ private SuggestionList(){}
+
+ SuggestionList(String query) {
+ this.query = query;
+ }
+
+ public Suggestion get(int index) {
+ return suggestions.get(index);
+ }
+
+ public Suggestion get(String suggested) {
+ for (Suggestion suggestion : suggestions) {
+ if (suggestion.getSuggested().equals(suggested)) {
+ return suggestion;
+ }
+ }
+ return null;
+ }
+
+ public Iterator iterator() {
+ return suggestions.iterator();
+ }
+
+ public int size() {
+ return suggestions.size();
+ }
+
+ public boolean containsSuggested(String suggested) {
+ for (Suggestion suggestion : suggestions) {
+ if (suggested.equals(suggestion.getSuggested())) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ public void addSuggested(String suggested, double score, Integer corpusQueryResults) {
+ if (containsSuggested(suggested)) {
+ throw new IllegalArgumentException("Already contains suggested '" + suggested + "'");
+ }
+ Suggestion suggestion = new Suggestion(suggested, score, corpusQueryResults);
+ int index = Collections.binarySearch(suggestions, suggestion);
+ if (index < 0) {
+ index = (index * -1) - 1;
+ }
+ suggestions.add(index, suggestion);
+ }
+
+ public static interface SuggestionFilter {
+ public abstract boolean accept(Suggestion suggestion);
+ }
+
+ public Suggestion[] toArray(Suggestion[] arr) {
+ return suggestions.toArray(arr);
+ }
+
+ public List filter(SuggestionFilter filter) {
+ List list = new LinkedList();
+ filter(list, filter);
+ return list;
+ }
+
+ public void filter(List list, SuggestionFilter filter) {
+ for (Suggestion s : this.suggestions) {
+ if (filter.accept(s)) {
+ list.add(s);
+ }
+ }
+ }
+
+ public void sort() {
+ Collections.sort(suggestions);
+ }
+
+ public String getQuery() {
+ return query;
+ }
+
+
+ List getSuggestions() {
+ return suggestions;
+ }
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/dictionary/Dictionary.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/dictionary/Dictionary.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/dictionary/Dictionary.java (revision 0)
@@ -0,0 +1,235 @@
+package org.apache.lucene.search.didyoumean.dictionary;
+
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+import com.sleepycat.je.DatabaseException;
+import com.sleepycat.je.Environment;
+import com.sleepycat.je.EnvironmentConfig;
+import com.sleepycat.persist.EntityCursor;
+import com.sleepycat.persist.EntityStore;
+import com.sleepycat.persist.PrimaryIndex;
+import com.sleepycat.persist.StoreConfig;
+import org.apache.lucene.search.didyoumean.SecondLevelSuggester;
+import org.apache.lucene.search.didyoumean.Suggester;
+import org.apache.lucene.search.didyoumean.Suggestion;
+import org.apache.lucene.search.didyoumean.SuggestionPriorityQueue;
+
+import java.io.File;
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.Map;
+
+/**
+ * Essentially a dictionary with a bunch of weighted suggestions, created by a {@link org.apache.lucene.search.didyoumean.Trainer}
+ * and navigated by a {@link org.apache.lucene.search.didyoumean.Suggester}.
+ *
+ * If the dictionary does not contain a suggestion for a given query, it will be passed on to any available instance
+ * of {@link org.apache.lucene.search.didyoumean.SecondLevelSuggester} that hopefully will comes up with a suggestion.
+ *
+ * @author Karl Wettin
+ * Date: Jul 31, 2006
+ * Time: 2:10:19 PM
+ */
+public class Dictionary {
+
+ private EntityStore store;
+ private PrimaryIndex suggestionsByQuery;
+
+
+ public Dictionary(File bdbPath) throws DatabaseException {
+ if (!bdbPath.exists()) {
+ System.out.println("Creating path " + bdbPath);
+ bdbPath.mkdirs();
+ }
+
+ EnvironmentConfig envConfig = new EnvironmentConfig();
+ envConfig.setAllowCreate(true);
+ envConfig.setTransactional(false);
+ envConfig.setLocking(false);
+ Environment env = new Environment(bdbPath, envConfig);
+
+ StoreConfig storeConfig = new StoreConfig();
+ storeConfig.setAllowCreate(true);
+ storeConfig.setTransactional(false);
+ store = new EntityStore(env, "didYouMean", storeConfig);
+
+ suggestionsByQuery = store.getPrimaryIndex(String.class, SuggestionList.class);
+ }
+
+ public SuggestionList suggestionListFactory(String query) {
+ return new SuggestionList(keyFormatter(query));
+ }
+
+ public void close() throws DatabaseException {
+ store.close();
+ }
+
+ /**
+ * This function allows modification of the dictionary keys. Default implementation stripps them from all
+ * whitespace and punctuation, enabling features such as "allwork and no fun" suggesting "all work and no fun"
+ * That will of course require that the trainer have inserted the key to suggest to it's original self. Training
+ * everything the users type in might consume a lot of resources.
+ * {@link org.apache.lucene.search.didyoumean.impl.DefaultTrainer#setTrainingFinalGoalAsSelf(boolean)} ()}
+ *
+ * @param inputKey dictionary key to be formatted
+ * @return inputKey stripped from all whitespace and punctuation.
+ */
+ public String keyFormatter(String inputKey) {
+ return inputKey.replaceAll("\\p{Punct}", "").replaceAll("\\s", "").toLowerCase();
+ }
+
+ /**
+ * Implementation must pass query through keyformatter!
+ * @param query unformatted key
+ * @return suggestion list associated with key
+ */
+ public SuggestionList getSuggestions(String query) throws DatabaseException {
+ return suggestionsByQuery.get(keyFormatter(query));
+ }
+
+ /**
+ * This is used by the second level cache so it only comes up with suggestions that don't exist in the dictionary.
+ *
+ * @param query the original query from the user
+ * @param suggested the suggestion to the query to inspect if suggestable
+ * @return true if the suggested is a known suggetion to the query.
+ */
+ public boolean isExistingSuggestion(String query, String suggested) throws DatabaseException {
+ SuggestionList suggestions = getSuggestions(query);
+ return suggestions != null && suggestions.containsSuggested(suggested);
+ }
+
+
+ private Map prioritesBySecondLevelSuggester = new HashMap();
+
+ /**
+ * Comes up with the best suggestion from the second level suggesters.
+ * {@link org.apache.lucene.search.didyoumean.SecondLevelSuggester}
+ *
+ * This metod also adds the suggestion to the dictionary!
+ *
+ * @param query the user input
+ * @param n number of suggestions requested
+ * @return the best suggestion the second level suggesters could come up with
+ */
+ public Suggestion[] getSecondLevelSuggestion(String query, int n) throws DatabaseException {
+ Map suggestionsBySuggested = new HashMap();
+ for (Map.Entry suggester_boost : prioritesBySecondLevelSuggester.entrySet()) {
+ SuggestionPriorityQueue suggestions = suggester_boost.getKey().suggest(query);
+ for (int i = 0; i < n && suggestions.size() > i; i++) {
+ Suggestion suggestion = (Suggestion) suggestions.pop();
+ if (suggester_boost.getKey().hasPersistableSuggestions()) {
+ Suggestion internedSuggestion = suggestionsBySuggested.get(suggestion.getSuggested());
+ if (internedSuggestion == null) {
+ internedSuggestion = new Suggestion(suggestion.getSuggested(), 0d);
+ }
+ internedSuggestion.setScore(internedSuggestion.getScore() + suggester_boost.getValue());
+ suggestionsBySuggested.put(suggestion.getSuggested(), internedSuggestion);
+ }
+ }
+ }
+
+ if (suggestionsBySuggested.size() == 0) {
+ return null;
+ }
+
+ Suggestion[] suggestions = suggestionsBySuggested.values().toArray(new Suggestion[suggestionsBySuggested.size()]);
+ Arrays.sort(suggestions, new Comparator() {
+ public int compare(Suggestion suggestion, Suggestion suggestion1) {
+ return Double.compare(suggestion.getScore(), suggestion1.getScore());
+ }
+ });
+
+ // add to dictionary
+ SuggestionList suggestionList = suggestionListFactory(query);
+ suggestionList.addSuggested(suggestions[0].getSuggested(), 1d, suggestions[0].getCorpusQueryResults());
+ suggestionsByQuery.put(suggestionList);
+ return suggestions;
+ }
+
+
+ public Map getPrioritesBySecondLevelSuggester() {
+ return prioritesBySecondLevelSuggester;
+ }
+
+ public void setPrioritesBySecondLevelSuggester(Map prioritesBySecondLevelSuggester) {
+ this.prioritesBySecondLevelSuggester = prioritesBySecondLevelSuggester;
+ }
+
+ /**
+ * Used to extract bootstrapped a priori corpus from the dictionary
+ * @return a map where suggestion is key and the value is a list of misspelled words that suggests the key.
+ * @throws DatabaseException
+ * @see org.apache.lucene.search.didyoumean.SuggestionFacade#secondLevelSuggestionFactory(org.apache.lucene.index.facade.IndexFacade,org.apache.lucene.index.facade.IndexFacade)
+ */
+ public Map inverted() throws DatabaseException {
+
+ // todo use a temporary bdb for this so we dont run out of memory
+
+ Map inverted = new HashMap();
+
+ for (Map.Entry e : getSuggestionsByQuery().map().entrySet()) {
+ Suggestion s = e.getValue().get(0);
+ SuggestionList sl = inverted.get(s.getSuggested());
+ if (sl == null) {
+ sl = new SuggestionList(s.getSuggested());
+ inverted.put(s.getSuggested(), sl);
+ }
+ sl.addSuggested(e.getKey(), s.getScore(), s.getCorpusQueryResults());
+ }
+
+
+ return inverted;
+ }
+
+
+ public PrimaryIndex getSuggestionsByQuery() {
+ return suggestionsByQuery;
+ }
+
+
+ /**
+ * Scans the dictionary for queries that suggests a query
+ * that in their own turn suggest something else.
+ * These bad suggestions will be replaced by the final suggestion,
+ * so that the suggester don't have to spend clock ticks doing this in real time.
+ *
+ * @param suggester
+ */
+ public void optimize(Suggester suggester) throws DatabaseException {
+ // todo
+ }
+
+ /**
+ * Removes excess suggestions that probably never be suggested,
+ * for instance those with too great distance from top suggestion.
+ */
+ public void prune(int maxSize) throws DatabaseException {
+ EntityCursor cursor = getSuggestionsByQuery().entities();
+ SuggestionList suggestionList;
+ while ((suggestionList = cursor.next()) != null) {
+ if (suggestionList.size() > maxSize) {
+ for (int i = maxSize; i < suggestionList.size(); i++) {
+ suggestionList.getSuggestions().remove(i);
+ }
+ getSuggestionsByQuery().put(suggestionList);
+ }
+ }
+ }
+
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/doc-files/goalTreeA.png
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/doc-files/goalTreeA.png
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/doc-files/goalTreeB.png
===================================================================
Cannot display: file marked as a binary type.
svn:mime-type = application/octet-stream
Property changes on: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/doc-files/goalTreeB.png
___________________________________________________________________
Name: svn:mime-type
+ application/octet-stream
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/doc-files/goalTreeA.dot
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/doc-files/goalTreeA.dot (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/doc-files/goalTreeA.dot (revision 0)
@@ -0,0 +1,20 @@
+digraph goalTreeA
+{
+ nodeA[shape=box label="heroes of knight and magic"];
+ nodeB[shape=box label="heroes of light and magic"];
+ nodeC[shape=box label="heroes of night and magic"];
+ nodeD[shape=box label="heroes of might and magic"];
+ nodeE[shape=box label="homm"];
+ nodeA->nodeB
+ nodeA->nodeC
+ nodeC->nodeD
+ nodeD->nodeE
+ inspectionD1[label="inspected 1st result"];
+ inspectionD2[label="commented 3rd result"];
+ inspectionD1->nodeD;
+ inspectionD2->nodeD;
+ inspectionE1[label="inspected 1st result"];
+ inspectionE2[label="inspected 2nd result"];
+ inspectionE1->nodeE;
+ inspectionE2->nodeE;
+}
\ No newline at end of file
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/doc-files/goalTreeB.dot
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/doc-files/goalTreeB.dot (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/doc-files/goalTreeB.dot (revision 0)
@@ -0,0 +1,33 @@
+digraph goalTreeA
+{
+ nodeA[shape=box label="heroes of knight and magic" color="red"];
+ nodeB[shape=box label="heroes of light and magic" color="red"];
+ nodeC[shape=box label="heroes of night and magic" color="red"];
+ nodeD[shape=box label="heroes of might and magic" color="green"];
+ nodeE[shape=box label="homm" color="green"];
+ nodeA->nodeB[label="edit distance"];
+ nodeA->nodeC[label="edit distance"];
+ nodeC->nodeD[label="edit distance"];
+ nodeD->nodeE[label="short time"];
+
+ inspectionD1[label="inspected 1st result"];
+ inspectionD2[label="commented 3rd result"];
+ inspectionD1->nodeD
+ inspectionD2->nodeD;
+ inspectionE1[label="inspected 1st result"];
+ inspectionE2[label="inspected 2nd result"];
+ inspectionE1->nodeE;
+ inspectionE2->nodeE;
+
+ suggestionA[label="heroes of might and magic" color="yellow"];
+ nodeA->suggestionA[label="adapt++"];
+ suggestionB[label="heroes of might and magic" color="yellow"];
+ nodeB->suggestionB[label="adapt++"];
+ suggestionC[label="heroes of might and magic" color="yellow"];
+ nodeC->suggestionC[label="adapt++"];
+ suggestionD[label="homm" color="yellow"];
+ nodeD->suggestionD[label="adapt++"];
+ suggestionE[label="heroes of might and magic" color="yellow"];
+ nodeE->suggestionE[label="adapt++"];
+
+}
\ No newline at end of file
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/doc-files/classdiagram.uxf
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/doc-files/classdiagram.uxf (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/doc-files/classdiagram.uxf (revision 0)
@@ -0,0 +1,489 @@
+
+
+
+ com.umlet.element.base.Class
+
+ 780
+ 290
+ 200
+ 90
+
+ QueryGoalNode<R>
+ --
+ +query:String
+ +timeStamp:long
+ +suggestions:Suggestion[]
+ --
+
+
+
+
+
+ com.umlet.element.base.Class
+
+ 760
+ 0
+ 230
+ 160
+
+ QueryGoalNode<R>.Inspection
+ --
+ +reference:R
+ +timeStamp:long
+ _+NOT_GOAL:Double=-1d_
+ _+NEUTRAL:Double=0d_
+ _+GOAL:Double=1d_
+ _+MOO:Double=null _
+ +classification:Double
+ --
+
+
+
+
+ com.umlet.element.base.Relation
+
+ 813
+ 140
+ 134
+ 170
+
+ lt=<<<<->
+ m2=0..*
+ <<ordered>>
+ 67;150;67;20
+
+
+ com.umlet.element.base.Relation
+
+ 860
+ 310
+ 277
+ 180
+
+ lt=<<<-
+ m2=0..*
+ r2=child
+ m1=1
+ r1=parent
+ <<ordered>>
+ 120;20;210;20;210;160;20;160;20;70
+
+
+ com.umlet.element.base.Relation
+
+ 650
+ 360
+ 170
+ 120
+
+ lt=<.
+ <<process>>
+ 150;20;20;100
+
+
+ com.umlet.element.base.Class
+
+ 140
+ 790
+ 160
+ 30
+
+ DefaultSuggester<R>
+
+
+
+ com.umlet.element.base.Relation
+
+ 540
+ 280
+ 40
+ 170
+
+ lt=<<.
+ 20;150;20;20
+
+
+ com.umlet.element.base.Class
+
+ 400
+ 270
+ 280
+ 30
+
+ DefaultQueryGoalTreeExtractor<R>
+
+
+
+
+ com.umlet.element.base.Note
+
+ 160
+ 600
+ 220
+ 30
+
+ The consumer level interface
+ bg=blue
+
+
+
+
+ com.umlet.element.base.Relation
+
+ 540
+ 460
+ 40
+ 160
+
+ lt=->
+ r2=1
+
+ 20;140;20;20
+
+
+ com.umlet.element.base.Relation
+
+ 540
+ 610
+ 40
+ 180
+
+ lt=->
+ r2=1
+
+ 20;20;20;160
+
+
+ com.umlet.element.base.Class
+
+ 440
+ 430
+ 230
+ 50
+
+ <<interface>>
+ QueryGoalTreeExtractor<R>
+
+
+
+
+ com.umlet.element.base.Class
+
+ 480
+ 600
+ 150
+ 30
+
+ SuggestionsFacade
+
+
+
+
+ com.umlet.element.base.Relation
+
+ 610
+ 590
+ 190
+ 40
+
+ lt=->
+ r2=1
+
+ 20;20;170;20
+
+
+ com.umlet.element.base.Class
+
+ 420
+ 770
+ 280
+ 80
+
+ <<interface>>
+ Suggester<R>
+ --
+ +didYouMean(String, int):Suggestion[]
+ +didYouMean(String):String
+
+
+
+
+ com.umlet.element.base.Relation
+
+ 980
+ 600
+ 180
+ 40
+
+ lt=<<.
+ 20;20;160;20
+
+
+ com.umlet.element.base.Relation
+
+ 832
+ 640
+ 136
+ 170
+
+ lt=<.
+ <<updates>>
+ 68;150;68;20
+
+
+ com.umlet.element.base.Class
+
+ 780
+ 570
+ 220
+ 90
+
+ <<interface>>
+ Trainer<R>
+ --
+ +queue(QueryGoalNode<R>)
+ +flush(Dictionary);
+
+
+
+
+ com.umlet.element.base.Class
+
+ 1140
+ 610
+ 150
+ 30
+
+ DefaultTrainer<R>
+
+
+
+ com.umlet.element.base.Class
+
+ 1170
+ 770
+ 170
+ 70
+
+ Suggestion
+ --
+ +suggested:String
+ +score:double
+ --
+
+
+
+
+ com.umlet.element.base.Relation
+
+ 920
+ 776
+ 270
+ 54
+
+ lt=<<<<->
+ q1=key
+ m2=0..*
+ <<ordered>>
+ 20;34;250;34
+
+
+ com.umlet.element.base.Note
+
+ 350
+ 1270
+ 280
+ 30
+
+ Basically the old contrib/spell checker
+ bg=blue
+
+
+
+
+ com.umlet.element.base.Relation
+
+ 610
+ 1270
+ 210
+ 40
+
+ lt=.
+ 190;20;20;20
+
+
+ com.umlet.element.base.Class
+
+ 800
+ 1270
+ 190
+ 30
+
+ NgramTokenSuggester
+
+
+
+ com.umlet.element.base.Class
+
+ 840
+ 1170
+ 120
+ 30
+
+ TokenSuggester
+
+
+
+ com.umlet.element.base.Class
+
+ 810
+ 1070
+ 180
+ 30
+
+ TokenPhraseSuggester
+
+
+
+ com.umlet.element.base.Relation
+
+ 880
+ 980
+ 40
+ 110
+
+ lt=<<.
+ 20;20;20;90
+
+
+ com.umlet.element.base.Relation
+
+ 880
+ 1080
+ 40
+ 110
+
+ lt=->
+ m2=1
+
+ 20;20;20;90
+
+
+ com.umlet.element.base.Relation
+
+ 360
+ 590
+ 140
+ 40
+
+ lt=.
+
+ 120;20;20;20
+
+
+ com.umlet.element.base.Relation
+
+ 880
+ 1180
+ 40
+ 110
+
+ lt=<<-
+ 20;20;20;90
+
+
+ com.umlet.element.base.Class
+
+ 760
+ 930
+ 280
+ 70
+
+ <<interface>>
+ SecondLevelSuggester
+ --
+ +didYouMean(String, int):Suggestion[]
+
+
+
+
+ com.umlet.element.base.Relation
+
+ 880
+ 800
+ 40
+ 150
+
+ lt=<<<<->
+ m2=0..*
+
+ 20;20;20;130
+
+
+ com.umlet.element.base.Relation
+
+ 680
+ 776
+ 190
+ 54
+
+ lt=<.
+ <<navigates>>
+ 170;34;20;34
+
+
+ com.umlet.element.base.Relation
+
+ 280
+ 780
+ 160
+ 40
+
+ lt=<<.
+ 140;20;20;20
+
+
+ com.umlet.element.base.Relation
+
+ 380
+ 820
+ 50
+ 40
+
+ lt=<<.
+ -80;-10;-80;-10
+
+
+ com.umlet.element.base.Relation
+
+ 880
+ 860
+ 110
+ 40
+
+ lt=.
+ 20;20;90;20
+
+
+ com.umlet.element.base.Class
+
+ 970
+ 860
+ 140
+ 30
+
+ _weight:double_
+
+
+
+ com.umlet.element.base.Class
+
+ 850
+ 790
+ 90
+ 30
+
+ Dictionary
+
+
+
\ No newline at end of file
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/Suggestion.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/Suggestion.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/Suggestion.java (revision 0)
@@ -0,0 +1,106 @@
+package org.apache.lucene.search.didyoumean;
+
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+import com.sleepycat.persist.model.Entity;
+import com.sleepycat.persist.model.Persistent;
+
+import java.io.Serializable;
+import java.text.DecimalFormat;
+
+/**
+ * A scored string value, a suggestion in a dictionary.
+ *
+ * @author Karl Wettin
+ * Date: Jul 31, 2006
+ * Time: 4:59:06 PM
+ *
+ */
+@Persistent
+public class Suggestion implements Comparable {
+
+ private String suggested;
+ private double score = 1d;
+ private Integer corpusQueryResults;
+
+
+ public Suggestion() {
+ }
+
+ public Suggestion(String suggested) {
+ this.suggested = suggested;
+ }
+
+
+ public Suggestion(String suggested, double score) {
+ this.suggested = suggested;
+ this.score = score;
+ }
+
+
+ public Suggestion(String suggested, double score, Integer corpusQueryResults) {
+ this.suggested = suggested;
+ this.score = score;
+ this.corpusQueryResults = corpusQueryResults;
+ }
+
+ /**
+ *
+ * @param suggestion suggestion this instance to be compared to
+ * @return the suggestion scores compared.
+ */
+ public int compareTo(Suggestion suggestion) {
+ return Double.compare(suggestion.getScore(), getScore());
+ }
+
+
+ public Integer getCorpusQueryResults() {
+ return corpusQueryResults;
+ }
+
+ public void setCorpusQueryResults(Integer corpusQueryResults) {
+ this.corpusQueryResults = corpusQueryResults;
+ }
+
+ public String getSuggested() {
+ return suggested;
+ }
+
+ public void setSuggested(String suggested) {
+ this.suggested = suggested;
+ }
+
+ public double getScore() {
+ return score;
+ }
+
+ public void setScore(double score) {
+ this.score = score;
+ }
+
+ private static final DecimalFormat df = new DecimalFormat("#.##");
+
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append(df.format(getScore()));
+ sb.append('\t');
+ sb.append(getSuggested());
+ return sb.toString();
+ }
+
+
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/AprioriCorpusFactory.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/AprioriCorpusFactory.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/AprioriCorpusFactory.java (revision 0)
@@ -0,0 +1,47 @@
+package org.apache.lucene.search.didyoumean;
+
+import com.sleepycat.je.DatabaseException;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.index.facade.IndexFacade;
+import org.apache.lucene.search.didyoumean.dictionary.Dictionary;
+
+import java.io.IOException;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * Extracts an a priori corpus from a dictionary.
+ *
+ * @author karl wettin
+ * Date: 2007-okt-19
+ * Time: 02:07:34
+ */
+public interface AprioriCorpusFactory {
+
+ /**
+ * Initializes and populates an a priori index with documents whos text is known to be correct.
+ *
+ * @param dictionary dictionary to extract data from.
+ * @param suggester suggester used to navigate the dictionary.
+ * @param aprioriIndex lucene index, will be created/reinitialized.
+ * @param aprioriIndexField index field used to store the a priori text.
+ * @param aprioriAnalyzer analyzer used to tokenize a priori text.
+ * @throws IOException
+ * @throws DatabaseException
+ */
+ public abstract void factory(Dictionary dictionary, Suggester suggester, IndexFacade aprioriIndex, String aprioriIndexField, Analyzer aprioriAnalyzer) throws IOException, DatabaseException;
+
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/SecondLevelSuggester.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/SecondLevelSuggester.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/SecondLevelSuggester.java (revision 0)
@@ -0,0 +1,35 @@
+package org.apache.lucene.search.didyoumean;
+
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * When the dictionary is not aware of any suggestion the second level
+ * suggesters are called upon to find the best suggestions and insert
+ * them to the dictionary.
+ *
+ * @author Karl Wettin
+ * Date: 2007-jan-30
+ * Time: 06:05:54
+ */
+public interface SecondLevelSuggester {
+ public abstract SuggestionPriorityQueue suggest(String query);
+
+ /**
+ * @return true if suggestions from this suggester should be persisted in the dictionary
+ */
+ public abstract boolean hasPersistableSuggestions();
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/AbstractSuggester.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/AbstractSuggester.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/AbstractSuggester.java (revision 0)
@@ -0,0 +1,40 @@
+package org.apache.lucene.search.didyoumean;
+
+import com.sleepycat.je.DatabaseException;
+import org.apache.lucene.search.didyoumean.dictionary.Dictionary;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * Abstract suggester.
+ *
+ * @author Karl Wettin
+ * Date: 2007-feb-17
+ * Time: 02:10:42
+ */
+public abstract class AbstractSuggester implements Suggester {
+
+ public String didYouMean(Dictionary dictionary, String query) throws DatabaseException {
+ Suggestion[] suggestions = didYouMean(dictionary, query, 1);
+ if (suggestions == null || suggestions.length == 0) {
+ return null;
+ } else {
+ return suggestions[0].getSuggested();
+ }
+ }
+
+
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/SuggestionPriorityQueue.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/SuggestionPriorityQueue.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/search/didyoumean/SuggestionPriorityQueue.java (revision 0)
@@ -0,0 +1,50 @@
+package org.apache.lucene.search.didyoumean;
+
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+import org.apache.lucene.util.PriorityQueue;
+
+import java.util.LinkedList;
+
+/**
+ * User: kalle
+ * Date: 2007-mar-03
+ * Time: 13:14:38
+ */
+public class SuggestionPriorityQueue extends PriorityQueue {
+
+ public SuggestionPriorityQueue(int maxSize) {
+ initialize(maxSize);
+ }
+
+
+ protected boolean lessThan(Object a, Object b) {
+ Suggestion s0 = (Suggestion) a;
+ Suggestion s1 = (Suggestion) b;
+ return s0.compareTo(s1) == -1;
+ }
+
+ public Suggestion[] toArray() {
+ LinkedList list = new LinkedList();
+ Suggestion s;
+ while ((s = (Suggestion) pop()) != null) {
+ list.add(s);
+ }
+ return list.toArray(new Suggestion[list.size()]);
+ }
+
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/index/facade/DirectoryIndexFacade.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/index/facade/DirectoryIndexFacade.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/index/facade/DirectoryIndexFacade.java (revision 0)
@@ -0,0 +1,44 @@
+package org.apache.lucene.index.facade;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.index.IndexReader;
+
+import java.io.IOException;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * @author karl wettin
+ * Date: 2007-aug-17
+ * Time: 00:17:14
+ */
+public class DirectoryIndexFacade extends IndexFacade {
+
+ private Directory directory;
+
+ public DirectoryIndexFacade(Directory directory) throws IOException {
+ this.directory = directory;
+ }
+
+ public IndexReader indexReaderFactory() throws IOException {
+ return IndexReader.open(directory);
+ }
+
+ public IndexWriterFacade indexWriterFactory(Analyzer analyzer, boolean create) throws IOException {
+ return new DirectoryIndexWriterFacade(directory, analyzer, create);
+ }
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/index/facade/IndexFacadeFactory.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/index/facade/IndexFacadeFactory.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/index/facade/IndexFacadeFactory.java (revision 0)
@@ -0,0 +1,29 @@
+package org.apache.lucene.index.facade;
+
+import java.io.IOException;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * @author karl wettin
+ * Date: 2007-okt-23
+ * Time: 20:37:44
+ */
+public interface IndexFacadeFactory {
+
+ public abstract IndexFacade factory() throws IOException;
+
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/index/facade/InstantiatedIndexFacade.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/index/facade/InstantiatedIndexFacade.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/index/facade/InstantiatedIndexFacade.java (revision 0)
@@ -0,0 +1,66 @@
+package org.apache.lucene.index.facade;
+
+import org.apache.lucene.store.instantiated.InstantiatedIndex;
+import org.apache.lucene.store.instantiated.InstantiatedIndexWriter;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.index.IndexReader;
+
+import java.io.IOException;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * @author karl wettin
+ * Date: 2007-aug-17
+ * Time: 03:51:57
+ */
+public class InstantiatedIndexFacade
+ extends IndexFacade {
+
+ private InstantiatedIndex ii;
+
+ public InstantiatedIndexFacade(InstantiatedIndex ii) {
+ this.ii = ii;
+ }
+
+ public IndexReader indexReaderFactory() throws IOException {
+ return ii.indexReaderFactory();
+ }
+
+ public IndexWriterFacade indexWriterFactory(final Analyzer analyzer, final boolean create) throws IOException {
+ return new IndexWriterFacade() {
+
+ private InstantiatedIndexWriter iw = ii.indexWriterFactory(analyzer, create);
+
+ public void addDocument(Document document, Analyzer analyzer) throws IOException {
+ iw.addDocument(document, analyzer);
+ }
+
+ public void addDocument(Document document) throws IOException {
+ iw.addDocument(document);
+ }
+
+ public void close() throws IOException {
+ iw.close();
+ }
+
+ public void optimize() throws IOException {
+
+ }
+ };
+ }
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/index/facade/IndexWriterFacade.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/index/facade/IndexWriterFacade.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/index/facade/IndexWriterFacade.java (revision 0)
@@ -0,0 +1,38 @@
+package org.apache.lucene.index.facade;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.document.Document;
+
+import java.io.IOException;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * @author karl wettin
+ * Date: 2007-aug-16
+ * Time: 23:23:39
+ */
+public abstract class IndexWriterFacade {
+
+ public abstract void addDocument(Document document, Analyzer analyzer) throws IOException;
+
+ public abstract void addDocument(Document document) throws IOException;
+
+ public abstract void close() throws IOException;
+
+ public abstract void optimize() throws IOException;
+
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/index/facade/IndexFacade.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/index/facade/IndexFacade.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/index/facade/IndexFacade.java (revision 0)
@@ -0,0 +1,34 @@
+package org.apache.lucene.index.facade;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.index.IndexReader;
+
+import java.io.IOException;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * @author karl wettin
+ * Date: 2007-aug-16
+ * Time: 23:23:22
+ */
+public abstract class IndexFacade {
+
+ public abstract IndexReader indexReaderFactory() throws IOException;
+
+ public abstract IndexWriterFacade indexWriterFactory(Analyzer analyzer, boolean create) throws IOException;
+
+}
Index: contrib/didyoumean/src/java/org/apache/lucene/index/facade/DirectoryIndexWriterFacade.java
===================================================================
--- contrib/didyoumean/src/java/org/apache/lucene/index/facade/DirectoryIndexWriterFacade.java (revision 0)
+++ contrib/didyoumean/src/java/org/apache/lucene/index/facade/DirectoryIndexWriterFacade.java (revision 0)
@@ -0,0 +1,53 @@
+package org.apache.lucene.index.facade;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.index.IndexWriter;
+
+import java.io.IOException;
+/*
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+
+/**
+ * @author karl wettin
+ * Date: 2007-aug-17
+ * Time: 00:18:25
+ */
+public class DirectoryIndexWriterFacade extends IndexWriterFacade {
+
+ private IndexWriter indexWriter;
+
+ public DirectoryIndexWriterFacade(Directory directory, Analyzer analyzer, boolean create) throws IOException {
+ indexWriter = new IndexWriter(directory, analyzer, create);
+ }
+
+ public void addDocument(Document document, Analyzer analyzer) throws IOException {
+ indexWriter.addDocument(document, analyzer);
+ }
+
+ public void addDocument(Document document) throws IOException {
+ indexWriter.addDocument(document);
+ }
+
+ public void close() throws IOException {
+ indexWriter.close();
+ }
+
+ public void optimize() throws IOException {
+ indexWriter.optimize();
+ }
+}
Index: contrib/didyoumean/build.xml
===================================================================
--- contrib/didyoumean/build.xml (revision 0)
+++ contrib/didyoumean/build.xml (revision 0)
@@ -0,0 +1,42 @@
+
+
+
+
+
+
+
+ Extended spell checker with phrase support and adaptive user session analysis.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+