使用pdfreader从pdf获取信息

问题描述

我已经成功使用 pdfreader 库(https://www.npmjs.com/package/pdfreader#raw-pdf-reading)从 PDF 中提取数据。只要 PDF 链接在浏览器中是“在线”打开(内联显示)的,一切都正常。但现在我遇到了这样一个链接:http://www.creperiet.nu/Homepage/Download-File/f/1219907/h/45405c646fdb41f8dd40188584afdfdf/Dagens+Weekly+Men ,访问它时浏览器会直接下载文件。我能否对这个链接做些处理,让它在浏览器中“在线”打开而不是直接下载?或者,能否通过 pdfreader 做些调整,使其能够处理这种类型的 PDF?

/**
 * Downloads the PDF at `url`, extracts its text lines and returns them.
 *
 * The download is delegated to `bufferize` and the text extraction to
 * `readlines` (called without a gap threshold). The extracted structure is
 * passed through a JSON round-trip, logged to the console, and returned.
 *
 * @param {string} url - Location of the PDF to fetch.
 * @returns {Promise<Array>} The JSON-copied result of `readlines`.
 */
module.exports.getAndUpdatePdfMenu = async function (url) {
  const pdfBuffer = await bufferize(url);
  const extracted = await readlines(pdfBuffer);

  // Serialise and re-parse to hand back a plain, detached copy.
  const serialised = JSON.stringify(extracted);
  const lines = await JSON.parse(serialised);

  console.log(lines);
  return lines;
};


/**
 * Downloads the resource at `url` and collects the whole response body.
 *
 * The transport is chosen from the parsed URL scheme: `https:` URLs use the
 * `https` module, anything else is handed to `http`. Host, port, path and
 * query all come from the parsed URL, so explicit ports are honoured and
 * plain-HTTP requests go to the HTTP default port instead of 443.
 *
 * The response status is not inspected and redirects are not followed; the
 * body of whatever response arrives is returned as-is.
 *
 * @param {string} url - Absolute http(s) URL of the file to fetch.
 * @returns {Promise<Buffer>} Resolves with the complete response body.
 *   Rejects if `url` cannot be parsed or if the request or response stream
 *   reports an error.
 */
async function bufferize(url) {
  const target = new URL(url);
  const useHttps = target.protocol === "https:";
  const client = useHttps ? https : http;
  const scheme = useHttps ? "https" : "http";

  return new Promise((resolve, reject) => {
    // Collect chunks and join once at the end rather than re-copying the
    // accumulated buffer on every "data" event.
    const chunks = [];

    const req = client.request(target, { method: "GET" }, res => {
      res.on("data", chunk => {
        chunks.push(chunk);
      });
      res.on("end", () => {
        resolve(Buffer.concat(chunks));
      });
      res.on("error", reject);
    });

    // Without the reject the returned promise would never settle when the
    // connection fails, leaving the caller awaiting forever.
    req.on("error", e => {
      console.error(scheme + " request error: " + e);
      reject(e);
    });

    req.end();
  });
}

/**
 * Extracts the text of a PDF buffer, grouped by page and by line.
 *
 * Text chunks that share the same `y` coordinate on a page are concatenated
 * into one line, in the order the reader reports them.
 *
 * @param {Buffer} buffer - Raw PDF data handed to `PdfReader#parseBuffer`.
 * @param {number} [xwidth] - Optional gap threshold. When set, a single space
 *   is inserted before a chunk whose `x` lies more than `xwidth` beyond the
 *   `x` of the previous chunk on the same line. This helps when words look
 *   separated only because of their x coordinates (no space characters exist
 *   between them in the PDF).
 * @returns {Promise<Array<Array<Array<string>>>>} One element per page
 *   (index = page number - 1); each page is an array of lines and each line
 *   is a one-element array holding that line's text. Rejects with the
 *   reader's error when parsing fails.
 */
async function readlines(buffer, xwidth) {
  return new Promise((resolve, reject) => {
    // pages[p] holds rows of [text, y, xOfLastChunk]; y and x are only
    // bookkeeping and are stripped before resolving.
    const pages = [];
    let pageIndex = 0;

    new pdfreader.PdfReader().parseBuffer(buffer, (err, item) => {
      if (err) {
        console.log("pdf reader error: " + err);
        // Settle the promise so callers are not left awaiting forever.
        reject(err);
        return;
      }

      if (!item) {
        // A missing item marks the end of the document.
        resolve(pages.map(rows => rows.map(row => [row[0]])));
        return;
      }

      if (item.page) {
        pageIndex = item.page - 1;
        pages[pageIndex] = [];
        return;
      }

      if (!item.text) {
        return;
      }

      // Tolerate text that arrives before any page marker.
      if (!pages[pageIndex]) {
        pages[pageIndex] = [];
      }
      const rows = pages[pageIndex];
      const row = rows.find(candidate => candidate[1] === item.y);

      if (!row) {
        rows.push([item.text, item.y, item.x]);
        return;
      }

      const separator = xwidth && item.x - row[2] > xwidth ? " " : "";
      row[0] += separator + item.text;
      // Track the latest chunk so the next gap is measured from it rather
      // than from the first chunk of the line.
      row[2] = item.x;
    });
  });
}

解决方法

暂未找到可以解决该程序问题的有效方法,小编正在努力寻找和整理中!

如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。

小编邮箱:dio#foxmail.com (将#修改为@)