Используя подсказки в процессе ввода поискового запроса, можно быстрее искать нужную информацию. Поисковые подсказки - это схожие с вашим запросом термины. Они появляются на экране по мере того, как вы вводите в строку поиска запрос. Для данного процесса критически важна скорость реакции на ввод пользователя, поэтому на серверной части ускорить процесс поможет специально заточенный индекс / модель данных, а на стороне клиента неплохо бы использовать AJAX, дабы не перезагружать каждый раз HTML-страницу целиком при изменении поискового запроса. На картинке показан результат работы алгоритма AnalyzingSuggester, который можно запустить, скачав исходники в конце текущего материала.
Первым делом нужно добавить в мульти-модульный maven проект новую зависимость lucene-suggest:
src/pom.xml
<!--
  Parent (aggregator) POM of the tutorial: packaging is "pom" and it lists the
  server, crawler and common modules. Dependencies declared directly under
  <dependencies> here are inherited by every module that uses this POM as its parent.
-->
<project xmlns="http://maven.apache.org/POM/4.0.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
<modelVersion>4.0.0</modelVersion>
<groupId>tutorial.lucene</groupId>
<artifactId>parent</artifactId>
<packaging>pom</packaging>
<version>1.0</version>
<properties>
<!-- Single place to bump the version shared by all Lucene artifacts below. -->
<lucene.version>6.0.0</lucene.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-core</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-analyzers-common</artifactId>
<version>${lucene.version}</version>
</dependency>
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-facet</artifactId>
<version>${lucene.version}</version>
</dependency>
<!-- Suggest module: provides org.apache.lucene.search.suggest.* (e.g. AnalyzingSuggester). -->
<dependency>
<groupId>org.apache.lucene</groupId>
<artifactId>lucene-suggest</artifactId>
<version>${lucene.version}</version>
</dependency>
</dependencies>
<modules>
<module>server</module>
<module>crawler</module>
<module>common</module>
</modules>
<build>
<plugins>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-compiler-plugin</artifactId>
<version>3.5.1</version>
<configuration>
<!-- The sources use lambdas and streams, so Java 8 is the minimum language level. -->
<source>1.8</source>
<target>1.8</target>
</configuration>
</plugin>
</plugins>
</build>
</project>
С помощью методов store / load в классе AnalyzingSuggester можно сохранять / восстанавливать модель данных автодополнения в файл для последующего быстрого старта. Поскольку эта логика используется как при индексации, так и при поиске, она вынесена в их общий модуль LuceneBinding. С помощью метода weight() задаётся приоритет для фраз автодополнения - в данном случае наиболее короткие фразы будут первыми в списке:
src/common/src/main/java/common/LuceneBinding.java
package common;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Set;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.search.spell.Dictionary;
import org.apache.lucene.search.suggest.InputIterator;
import org.apache.lucene.search.suggest.analyzing.AnalyzingSuggester;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
/* This class is used both by Crawler and SearchServlet */
/**
 * Shared Lucene settings (index locations, field and facet names, analyzer)
 * used by both the crawler and the servlets, so that indexing and searching
 * always agree on the same configuration.
 */
public final class LuceneBinding {
    /** Root directory of all tutorial indexes, located under the user's home directory. */
    private static final Path ROOT = Paths.get(System.getProperty("user.home"), "lucene-tutorial-index");
    public static final Path SEARCH_INDEX_PATH = LuceneBinding.ROOT.resolve("search");
    public static final Path TAXO_INDEX_PATH = LuceneBinding.ROOT.resolve("taxo");
    public static final Path SUGGEST_INDEX_PATH = LuceneBinding.ROOT.resolve("suggest");

    public static final String FIELD_ID = "id";
    public static final String FIELD_TITLE = "title";
    public static final String FIELD_CONTENT = "content";
    public static final String FIELD_CATEGORY = "category";
    public static final String FIELD_DIRECTOR = "director";
    public static final String FIELD_RATE = "rate";

    public static final String FACET_DIRECTOR = "Director";
    public static final String FACET_DATE = "Release Date";
    public static final String FACET_CATEGORY = "Category";

    /** Maximum number of consecutive words combined into one suggestion phrase (shingle). */
    public static final int SUGGEST_MAX_SHINGLES = 5;

    /**
     * Creates the facet configuration: the date facet is hierarchical and the
     * category facet is multi-valued.
     *
     * @return a new, independently configurable {@link FacetsConfig}
     */
    public static FacetsConfig getFacetsConfig() {
        final FacetsConfig config = new FacetsConfig();
        config.setHierarchical(LuceneBinding.FACET_DATE, true);
        config.setMultiValued(LuceneBinding.FACET_CATEGORY, true);
        return config;
    }

    /**
     * @return a new {@link StandardAnalyzer}; the same analyzer type must be used
     *         for indexing and for querying
     */
    public static Analyzer getAnalyzer() {
        return new StandardAnalyzer();
    }

    /**
     * {@link AnalyzingSuggester} whose model is persisted to / restored from a
     * file below {@link LuceneBinding#SUGGEST_INDEX_PATH}.
     */
    public static final class Suggester extends AnalyzingSuggester {
        /** File holding the serialized suggester model. */
        private static final Path ser = LuceneBinding.SUGGEST_INDEX_PATH.resolve("serialized");

        private Suggester(final FSDirectory tempDir) {
            super(tempDir, "", LuceneBinding.getAnalyzer());
        }

        /** Opens the directory handed to the suggester for its temporary files. */
        private static FSDirectory openTempDirectory() throws IOException {
            return FSDirectory.open(LuceneBinding.SUGGEST_INDEX_PATH.resolve("tmp"));
        }

        /**
         * Builds a suggester from every entry of {@code input} and serializes it
         * to disk. Shorter phrases receive a higher weight.
         *
         * @param input dictionary supplying the suggestion phrases
         * @throws IOException if building or writing the model fails
         */
        public static void store(final Dictionary input) throws IOException {
            // The temporary directory is only handed to this short-lived suggester,
            // so close it as soon as the model has been written.
            try (final FSDirectory tempDir = Suggester.openTempDirectory()) {
                final Suggester suggester = new Suggester(tempDir);
                suggester.build(new InputIterator() {
                    final InputIterator i = input.getEntryIterator();
                    private BytesRef c;
                    final int MAX_TOKEN_LENGTH = LuceneBinding.SUGGEST_MAX_SHINGLES
                            * (1 + StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);

                    @Override
                    public BytesRef next() throws IOException {
                        this.c = this.i.next();
                        return this.c;
                    }

                    @Override
                    public long weight() {
                        // Short phrases are more important. BytesRef.length counts
                        // UTF-8 bytes while the bound above is derived from a character
                        // limit, so very long non-ASCII phrases could make the difference
                        // negative; AnalyzingSuggester rejects negative weights, hence
                        // the clamp to zero.
                        return Math.max(0L, (long) this.MAX_TOKEN_LENGTH - this.c.length);
                    }

                    @Override
                    public BytesRef payload() {
                        return this.i.payload();
                    }

                    @Override
                    public boolean hasPayloads() {
                        return this.i.hasPayloads();
                    }

                    @Override
                    public Set<BytesRef> contexts() {
                        return this.i.contexts();
                    }

                    @Override
                    public boolean hasContexts() {
                        return this.i.hasContexts();
                    }
                });
                try (final OutputStream os = Files.newOutputStream(Suggester.ser)) {
                    suggester.store(os);
                }
            }
        }

        /**
         * Restores the suggester previously written by {@link #store(Dictionary)}.
         *
         * @return a suggester ready for lookups
         * @throws IOException if the serialized model cannot be read
         */
        public static Suggester load() throws IOException {
            // The directory stays open on success because the returned suggester
            // keeps referencing it; it is released only when loading fails.
            final FSDirectory tempDir = Suggester.openTempDirectory();
            boolean loaded = false;
            try {
                final Suggester suggester = new Suggester(tempDir);
                try (final InputStream is = Files.newInputStream(Suggester.ser)) {
                    suggester.load(is);
                }
                loaded = true;
                return suggester;
            } finally {
                if (!loaded) {
                    tempDir.close();
                }
            }
        }
    }
}
Индекс для автодополнения будет строиться по двум полям suggestIndexer.add(title, description). Комбинации слов достигаются с помощью ShingleAnalyzerWrapper:
src/crawler/src/main/java/crawler/SugestIndexer.java
package crawler;
import java.io.Closeable;
import java.io.IOException;
import java.nio.file.Path;
import org.apache.lucene.analysis.shingle.ShingleAnalyzerWrapper;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.IndexWriterConfig.OpenMode;
import org.apache.lucene.search.spell.LuceneDictionary;
import org.apache.lucene.store.FSDirectory;
import common.LuceneBinding;
import common.LuceneBinding.Suggester;
/**
 * Collects suggestion source texts into a temporary "shingles" index and, on
 * {@link #close()}, turns the terms of that index into the persisted
 * {@link Suggester} model.
 */
public class SugestIndexer implements Closeable {
    private final IndexWriter writer;
    /** Directory backing {@link #writer}; owned by this object and released in {@link #close()}. */
    private final FSDirectory directory;
    /** Set once {@link #close()} has run so that repeated calls are no-ops. */
    private boolean closed;

    public static final String FIELD_SUGGEST = "SUGGEST";
    private static final Path SHINGLES_INDEX_PATH = LuceneBinding.SUGGEST_INDEX_PATH.resolve("shingles");

    /**
     * Opens the shingles index in CREATE mode, replacing any previous content.
     * Text is analyzed with the shared analyzer wrapped in a
     * {@link ShingleAnalyzerWrapper} limited to
     * {@link LuceneBinding#SUGGEST_MAX_SHINGLES} words per shingle.
     *
     * @throws IOException if the index cannot be opened
     */
    public SugestIndexer() throws IOException {
        final IndexWriterConfig iwConfig = new IndexWriterConfig(
                new ShingleAnalyzerWrapper(LuceneBinding.getAnalyzer(), LuceneBinding.SUGGEST_MAX_SHINGLES));
        iwConfig.setOpenMode(OpenMode.CREATE);

        final FSDirectory dir = FSDirectory.open(SugestIndexer.SHINGLES_INDEX_PATH);
        final IndexWriter indexWriter;
        try {
            indexWriter = new IndexWriter(dir, iwConfig);
        } catch (IOException | RuntimeException e) {
            // Do not leak the directory when the writer cannot be created.
            try {
                dir.close();
            } catch (final IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
        this.directory = dir;
        this.writer = indexWriter;
    }

    /**
     * Adds one document whose suggest field contains every given text.
     *
     * @param items texts (for example a title and a description) to derive suggestions from
     * @throws IOException if the document cannot be written
     */
    public void add(final String... items) throws IOException {
        final Document doc = new Document();
        for (final String item : items) {
            doc.add(new TextField(SugestIndexer.FIELD_SUGGEST, item, Store.NO));
        }
        this.writer.addDocument(doc);
    }

    /**
     * Closes the writer, builds and stores the suggester from the indexed
     * terms, and finally releases the index directory. Calling it again has no effect.
     *
     * @throws IOException if closing the index or storing the suggester fails
     */
    @Override
    public void close() throws IOException {
        if (this.closed) {
            return;
        }
        this.closed = true;
        try {
            this.writer.close();
            try (final IndexReader reader = DirectoryReader.open(this.directory)) {
                Suggester.store(new LuceneDictionary(reader, SugestIndexer.FIELD_SUGGEST));
            }
        } finally {
            // IndexWriter.close() does not close the directory it was given.
            this.directory.close();
        }
        // TODO: we can remove SHINGLES_INDEX_PATH directory now
    }
}
Сервлет отдаёт все варианты автодополнений в JSON-формате http://localhost:8080/api/suggest?q=космос:
src/server/src/main/java/server/SuggestServlet.java
package server;
import java.io.IOException;
import java.util.AbstractMap.SimpleEntry;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import org.json.JSONArray;
import common.LuceneBinding.Suggester;
/**
 * Returns autocompletion suggestions for the {@code q} request parameter as a
 * JSON array of objects with the keys {@code key}, {@code hi} and {@code val}.
 */
public class SuggestServlet extends HttpServlet {
    /** Lazily loaded suggester shared by all requests. */
    private static volatile Suggester suggester;
    /** Upper bound on the number of suggestions requested per lookup. */
    private static final int MAX_SUGGESTS = 22;

    /**
     * Writes the suggestions for {@code q}, highest weight first. A request
     * without the {@code q} parameter yields an empty array instead of failing.
     *
     * @param req request carrying the optional {@code q} parameter
     * @param res response that receives the UTF-8 encoded JSON array
     * @throws IOException if the suggester cannot be loaded or the response cannot be written
     */
    @Override
    public void doGet(final HttpServletRequest req, final HttpServletResponse res)
            throws ServletException, IOException {
        final String query = req.getParameter("q");
        final Collection<Map<String, Object>> lookupResultList;
        if (query == null) {
            // No query given: nothing to complete (lookup would fail on null).
            lookupResultList = new ArrayList<>();
        } else {
            lookupResultList = SuggestServlet.getSuggester()
                    .lookup(query, false, SuggestServlet.MAX_SUGGESTS).stream()
                    .sorted((e1, e2) -> Long.compare(e2.value, e1.value))
                    .map(v -> {
                        final Map<String, Object> entry = new HashMap<>();
                        entry.put("key", v.key);
                        entry.put("hi", v.highlightKey);
                        entry.put("val", v.value);
                        return entry;
                    })
                    .collect(Collectors.toCollection(ArrayList::new));
        }
        final JSONArray json = new JSONArray(lookupResultList);
        res.setContentType("application/json");
        res.setCharacterEncoding("UTF-8");
        res.getWriter().write(json.toString());
    }

    /** Loads the shared suggester on first use (double-checked locking on the volatile field). */
    private static Suggester getSuggester() throws IOException {
        Suggester current = SuggestServlet.suggester;
        if (current == null) {
            synchronized (SuggestServlet.class) {
                current = SuggestServlet.suggester;
                if (current == null) {
                    current = Suggester.load();
                    SuggestServlet.suggester = current;
                }
            }
        }
        return current;
    }
}
Сборка и запуск:
~$ cd src
~$ mvn clean install
~$ mvn -pl crawler/ exec:java -Dexec.mainClass="crawler.App"
~$ mvn -pl server/ jetty:run
Исходники
ElasticSearch
There are comments.