问题描述
我看过几篇相关的回答(原文中的 this、this 和 this 链接),但我不确定为什么它们对我不起作用。
我通常会使用如下所示的分析器。
import lucene
from org.apache.lucene.analysis.core import WhitespaceAnalyzer
from org.apache.lucene.index import IndexWriterConfig,IndexWriter
from org.apache.lucene.store import SimpleFSDirectory
from java.nio.file import Paths
from org.apache.lucene.document import Document,Field,TextField
# Build a small on-disk Lucene index containing a single document whose
# "title" field is tokenized by a whitespace analyzer.
index_path = "./index"
lucene.initVM()  # must run before touching any Lucene class

analyzer = WhitespaceAnalyzer()
config = IndexWriterConfig(analyzer)
store = SimpleFSDirectory(Paths.get(index_path))
try:
    writer = IndexWriter(store, config)
    try:
        doc = Document()
        # TYPE_STORED: tokens are indexed AND the raw value is stored for retrieval.
        doc.add(Field("title", "The quick brown fox.", TextField.TYPE_STORED))
        writer.addDocument(doc)
    finally:
        # Always commit and release the index write lock, even if addDocument fails.
        writer.close()
finally:
    store.close()
我想使用自定义的 MyAnalyzer() 来代替 WhitespaceAnalyzer(),它应该包含 LowerCaseFilter 和 WhitespaceTokenizer。
from org.apache.lucene.analysis.core import LowerCaseFilter,Whitespacetokenizer
from org.apache.pylucene.analysis import PythonAnalyzer
class MyAnalyzer(PythonAnalyzer):
    """Custom analyzer skeleton from the question — intentionally incomplete."""
    def __init__(self):
        PythonAnalyzer.__init__(self)
    def createComponents(self,fieldName):
        """TODO: build and return the token-stream components (this is the question)."""
        # What do I write here?
你能帮我编写并使用 MyAnalyzer() 吗?
解决方法
from org.apache.lucene.analysis.core import LowerCaseFilter,WhitespaceTokenizer
from org.apache.pylucene.analysis import PythonAnalyzer
from org.apache.lucene.analysis import Analyzer
class MyAnalyzer(PythonAnalyzer):
    """Analyzer that splits on whitespace and lower-cases every token."""

    def __init__(self):
        PythonAnalyzer.__init__(self)

    def createComponents(self, fieldName):
        """Return the tokenizer/filter chain for *fieldName*.

        Pipeline: WhitespaceTokenizer -> LowerCaseFilter.
        """
        tokenizer = WhitespaceTokenizer()
        lowercased = LowerCaseFilter(tokenizer)
        return Analyzer.TokenStreamComponents(tokenizer, lowercased)
如果有人能指出我正确的方向以便能够正确地找到这些答案,那就太好了。