问题描述
我正在尝试使用 Aspose.pdf 和正则表达式在 pdf 文件中查找某些单词。代码运行没有错误,但永远不会返回 TRUE。
''' <summary>
''' Reports whether any text in a PDF file matches a regular expression, using Aspose.PDF.
''' </summary>
''' <param name="sourcePdf">Path of the PDF file to open.</param>
''' <param name="searchPhrase">Regular-expression pattern handed to the text absorber, e.g. "D[a-z]{7}".</param>
''' <returns>
''' True when the absorber yields at least one text fragment; False when it yields none
''' or when an exception was caught.
''' </returns>
''' <remarks>
''' Every fragment found is written to the console. Exceptions are not propagated:
''' the message is shown in a message box and the function returns False.
''' </remarks>
Public Shared Function FindInPDF(sourcePdf As String,searchPhrase As String) As Boolean
    Dim found As Boolean = False
    Try
        ' Using disposes the document (and the underlying file handle) even when
        ' the search throws; the original left the document open.
        Using pdfDocument As New Document(sourcePdf)
            ' TextSearchOptions(True) asks the absorber to treat searchPhrase as a
            ' regular expression rather than a literal string.
            Dim absorber As Aspose.Pdf.Text.TextFragmentAbsorber = New Aspose.Pdf.Text.TextFragmentAbsorber(searchPhrase) With {
                .TextSearchOptions = New TextSearchOptions(True)
            }

            ' Run the absorber over every page of the document.
            pdfDocument.Pages.Accept(absorber)

            ' The fragments must be read while the document is still open.
            For Each textFragment As Aspose.Pdf.Text.TextFragment In absorber.TextFragments
                Console.WriteLine("Text : {0} ",textFragment.Text)
                found = True
            Next
        End Using
    Catch ex As Exception
        MessageBox.Show(ex.Message)
    End Try
    Return found
End Function
正则表达式字符串通过 searchPhrase 参数传入该函数。
解决方法
我没有继续使用付费库 Aspose.PDF,而是改用了 iTextSharp。它具有相同的功能。
''' <summary>
''' Reports whether the text of a PDF file matches a regular expression, using iTextSharp.
''' </summary>
''' <param name="PdfFileName">Path of the PDF file to read.</param>
''' <param name="searchPhrase">Regular-expression pattern to look for; matched case-insensitively.</param>
''' <returns>
''' True as soon as the text extracted so far contains a match; False when no match is
''' found or when an exception was caught.
''' </returns>
''' <remarks>
''' Page text is accumulated and the pattern is tested against the accumulated text after
''' each page, so a match that straddles two consecutive pages is still found.
''' Exceptions are not propagated: the message is shown in a message box and the
''' function returns False.
''' </remarks>
Public Shared Function GetTextFromPDF2(ByVal PdfFileName As String,searchPhrase As String) As Boolean
    Dim oReader As iTextSharp.text.pdf.PdfReader = Nothing
    Try
        oReader = New iTextSharp.text.pdf.PdfReader(PdfFileName)

        ' Build the regex once. IgnoreCase replaces the former ToLower() on the text:
        ' lowercasing only the text made any pattern containing an uppercase literal
        ' (such as "D[a-z]{7}") impossible to match.
        Dim pattern As New Regex(searchPhrase, RegexOptions.IgnoreCase)

        Dim sOut As String = ""
        For i As Integer = 1 To oReader.NumberOfPages
            ' A fresh strategy per page, as the extractor accumulates state in it.
            Dim its As New iTextSharp.text.pdf.parser.SimpleTextExtractionStrategy
            sOut &= iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(oReader,i,its)

            ' Stop at the first hit; the Finally block still closes the reader.
            If pattern.IsMatch(sOut) Then
                Return True
            End If
        Next
    Catch ex As Exception
        MessageBox.Show(ex.Message)
    Finally
        ' Release the file held by the reader; the original never closed it.
        If oReader IsNot Nothing Then
            oReader.Close()
        End If
    End Try
    Return False
End Function