问题描述
目标:读取 PDF 文件并将其转换为文本格式。
我为此使用了 SAS 的“Proc Groovy”和 Java 库“Apache PDFBox”。但是在 Apache PDFBox 版本从 2.0.21 更改为 2.0.22 之后,代码开始报错。请指出需要对这段代码做哪些修改,才能让它重新正常运行。
SAS Proc Groovy 能够运行 Java 代码。所以我们在 SAS Proc Groovy 中使用 Java PDF Library (Apache PDFBox) 将 PDF 转换为文本格式。
代码:
/* Filerefs for the source PDF and for the text file that the Groovy step writes. */
filename overview "&temp/overview.pdf";
filename ov_text "&temp/overview.txt";
* download a pdf document;
proc http
url="https://cdn.nar.realtor/sites/default/files/documents/ehs-11-2020-overview-2020-12-22.pdf"
method="get"
proxyhost="&proxy_host."
proxyport=&port
out=overview;
run;
* download the Apache PDFBox library (a .jar file);
/* Double quotes are required so the macro variable in the path is resolved;
   inside single quotes it would be kept as literal text. */
filename jar "&temp/pdfBox.jar";
/* NOTE(review): the download is skipped whenever the file already exists, so a
   truncated or corrupt jar left behind by an earlier failed download has to be
   deleted by hand before rerunning. */
%if %sysfunc(FEXIST(jar)) ne 1 %then %do;
/* Version-pinned releases are served from archive.apache.org; the artifact
   path is lowercase and case-sensitive. */
proc http
url="https://archive.apache.org/dist/pdfbox/2.0.22/pdfbox-app-2.0.22.jar"
method="get"
proxyhost="&proxy_host."
proxyport=&port
out=jar;
run;
%end;
* Use GROOVY to read the PDF,strip out the text and position,and write that
* parse to a text file which SAS can read;
/* CLASSPATH has to name the jar file itself, so it is taken from the fileref
   path, the same way the two SUBMIT arguments below are built. */
proc groovy classpath="%sysfunc(pathname(jar))";
submit
"%sysfunc(pathname(overview))" /* the input,a pdf file */
"%sysfunc(pathname(ov_text))" /* the output,a text file */
;
import org.apache.pdfBox.pdmodel.PDDocument;
import org.apache.pdfBox.text.PDFTextStripper;
import org.apache.pdfBox.text.TextPosition;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.List;
import java.io.FileWriter;
import java.io.PrintWriter;
public class GetLinesFromPDF extends PDFTextStripper {
static List<String> lines = new ArrayList<String>();
public GetLinesFromPDF() throws IOException {
}
/**
* @throws IOException If there is an error parsing the document.
*/
public static void main( String[] args ) throws IOException {
PDDocument document = null;
PrintWriter out = null;
String inPdf = args[0];
String outTxt = args[1];
try {
document = PDDocument.load( new File(inPdf) );
PDFTextStripper stripper = new GetLinesFromPDF();
stripper.setSortByPosition( true );
stripper.setStartPage( 0 );
stripper.setEndPage( document.getNumberOfPages() );
Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
stripper.writeText(document,dummy);
out = new PrintWriter(new FileWriter(outTxt));
// print lines to text file
for(String line:lines){
out.println(line);
}
}
finally {
if( document != null ) {
document.close();
}
if( out != null ) {
out.close();
}
}
}
/**
* Override the default functionality of PDFTextStripper.writeString()
*/
@Override
protected void writeString(String str,List<TextPosition> textPositions) throws IOException {
String places = "";
for(TextPosition tp:textPositions){
places += "(" + tp.getX() + "," + tp.getY() + ") ";
}
lines.add(str + " found @ " + places);
}
}
endsubmit;
quit;
* preview the stripped text that was saved;
/* Echo every record of the text file behind the ov_text fileref to the SAS log. */
data _null_;
infile ov_text;
input; /* load the next record into the _infile_ buffer without creating variables */
putlog _infile_; /* write the raw record to the log */
run;
ERROR: The SUBMIT command Failed.
groovy.lang.GroovyRuntimeException: Failed to create Script instance for class: class GetLinesFromPDF. Reason:
java.util.zip.ZipException: error reading zip file
at org.codehaus.groovy.runtime.InvokerHelper.createScript(InvokerHelper.java:475)
at groovy.lang.GroovyShell.parse(GroovyShell.java:689)
at groovy.lang.GroovyShell.parse(GroovyShell.java:725)
at groovy.lang.GroovyShell.parse(GroovyShell.java:716)
Caused by: java.util.zip.ZipException: error reading zip file
at java.util.zip.ZipFile.read(Native Method)
at java.util.zip.ZipFile.access$1400(ZipFile.java:60)
at java.util.zip.ZipFile$ZipFileInputStream.read(ZipFile.java:734)
at java.util.zip.ZipFile$ZipFileInflaterInputStream.fill(ZipFile.java:434)
at java.util.zip.InflaterInputStream.read(InflaterInputStream.java:158)
at java.io.FilterInputStream.read(FilterInputStream.java:133)
at sun.nio.cs.StreamDecoder.readBytes(StreamDecoder.java:284)
at sun.nio.cs.StreamDecoder.implRead(StreamDecoder.java:326)
at sun.nio.cs.StreamDecoder.read(StreamDecoder.java:178)
at java.io.InputStreamReader.read(InputStreamReader.java:184)
at java.io.BufferedReader.fill(BufferedReader.java:161)
at java.io.BufferedReader.readLine(BufferedReader.java:324)
at java.io.BufferedReader.readLine(BufferedReader.java:389)
at org.apache.pdfbox.pdmodel.font.encoding.GlyphList.loadList(GlyphList.java:147)
at org.apache.pdfbox.pdmodel.font.encoding.GlyphList.<init>(GlyphList.java:137)
at org.apache.pdfbox.text.LegacyPDFStreamEngine.<init>(LegacyPDFStreamEngine.java:120)
at org.apache.pdfbox.text.PDFTextStripper.<init>(PDFTextStripper.java:214)
解决方法
暂未找到可以解决该程序问题的有效方法,小编正在努力寻找和整理中!
如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。
小编邮箱:dio#foxmail.com (将#修改为@)