使用pdfjs将PDF转换为HTMLXML sturcture

问题描述

我发现此代码以HTML(而非图像)显示PDF。 只有当我可以获取所有页面时,我才不知道当时的工作方式,无法将pdf的最后一页转换为html之类的xml。集中精力只是获得可以检索数据的数据结构。

我无法在日志中获取最后一页的结果。因此,如果有1页PDF,请不要转换。我喜欢将数据从PDF转换为类似HTML(XML)的结构。

var __PDF_DOC,__CURRENT_PAGE,__TOTAL_PAGES,__PAGE_RENDERING_IN_PROGRESS = 0,__CANVAS = $('#pdf-canvas').get(0),_x ="",__o=0,__CANVAS_CTX = __CANVAS.getContext('2d');

function showPDF(pdf_url) {
    $("#pdf-loader").show();

    PDFJS.getDocument({ url: pdf_url }).then(function(pdf_doc) {
        __PDF_DOC = pdf_doc;
        __TOTAL_PAGES = __PDF_DOC.numPages;
        
        // Hide the pdf loader and show pdf container in HTML
        $("#pdf-loader").hide();
        $("#pdf-contents").show();
        $("#pdf-total-pages").text(__TOTAL_PAGES);

        // Show the first page
        showPage(1);
    }).catch(function(error) {
        // If error re-show the upload button
        $("#pdf-loader").hide();
        $("#upload-button").show();
        
        alert(error.message);
    });;
}

function showPage(page_no) {
    __PAGE_RENDERING_IN_PROGRESS = 1;
    __CURRENT_PAGE = page_no;

    // disable Prev & Next buttons while page is being loaded
    $("#pdf-next,#pdf-prev").attr('disabled','disabled');

    // While page is being rendered hide the canvas and show a loading message
    $("#pdf-canvas").hide();
    $("#page-loader").show();

    // Update current page in HTML
    $("#pdf-current-page").text(page_no);
    
    // Fetch the page
    __PDF_DOC.getPage(page_no).then(function(page) {
        // As the canvas is of a fixed width we need to set the scale of the viewport accordingly
        var scale_required = __CANVAS.width / page.getViewport(1).width;

        // Get viewport of the page at required scale
        var viewport = page.getViewport(scale_required);

        // Set canvas height
        __CANVAS.height = viewport.height;

        var renderContext = {
            canvasContext: __CANVAS_CTX,viewport: viewport
        };
        
        // Render the page contents in the canvas
        page.render(renderContext).then(function() {
            __PAGE_RENDERING_IN_PROGRESS = 0;

            // Re-enable Prev & Next buttons
            $("#pdf-next,#pdf-prev").removeAttr('disabled');

            // Show the canvas and hide the page loader
            $("#pdf-canvas").show();
            $("#page-loader").hide();

            // Return the text contents of the page after the pdf has been rendered in the canvas
            return page.getTextContent();
        }).then(function(textContent) {
            // Get canvas offset
            var canvas_offset = $("#pdf-canvas").offset();

            // Clear HTML for text layer
            sor();
            $("#text-layer").html('');

            // Assign the CSS created to the text-layer element
            $("#text-layer").css({ left: canvas_offset.left + 'px',top: canvas_offset.top + 'px',height: __CANVAS.height + 'px',width: __CANVAS.width + 'px' });

            // Pass the data to the method for rendering of text over the pdf canvas.
            PDFJS.renderTextLayer({
                textContent: textContent,container: $("#text-layer").get(0),viewport: viewport,textDivs: []
            });
        });
    });
}

// Upon click this should should trigger click on the #file-to-upload file input element
// This is better than showing the not-good-looking file input element
$("#upload-button").on('click',function() {
$("#pdf-main-container").show();
var _x ="";
    $("#file-to-upload").trigger('click');
});

// When user chooses a PDF file
$("#file-to-upload").on('change',function() {
    // Validate whether PDF
    if(['application/pdf'].indexOf($("#file-to-upload").get(0).files[0].type) == -1) {
        alert('Error : Not a PDF');
        return;
    }

    $("#upload-button").hide();
    //_x =" ";
    //__CURRENT_PAGE = undefined;
    // Send the object url of the pdf
    showPDF(URL.createObjectURL($("#file-to-upload").get(0).files[0]));
});

// PrevIoUs page of the PDF
$("#pdf-prev").on('click',function() {
    if(__CURRENT_PAGE != 1)
        showPage(--__CURRENT_PAGE);
});

// Next page of the PDF
$("#pdf-next").on('click',function() {
    if(__CURRENT_PAGE != __TOTAL_PAGES)
        showPage(++__CURRENT_PAGE);
});
function sor() {
    ++__o;
    if(__o==__CURRENT_PAGE){
    _x = _x + $("#text-layer").html();
    
    }
    if((__CURRENT_PAGE != __TOTAL_PAGES) && __o == __CURRENT_PAGE){
        showPage(++__CURRENT_PAGE);
        }
    else {
    
    _x = _x + $("#text-layer").html();
    var _y= _x.replace(/(style="([^>]+)")/gi,"");
    $("#text-layer,#pdf-canvas").html(" ");
    $("#upload-button").show();
    $("#pdf-main-container").show();
    console.log(_y);
        }
    }
#upload-button {
    width: 150px;
    display: block;
    margin: 20px auto;
}

#file-to-upload {
    display: none;
}

#pdf-main-container {
    width: 400px;
    margin: 20px auto;
}

#pdf-loader {
    display: none;
    text-align: center;
    color: #999999;
    font-size: 13px;
    line-height: 100px;
    height: 100px;
}

#pdf-contents {
    display: none;
}

#pdf-Meta {
    overflow: hidden;
    margin: 0 0 20px 0;
    z-index: 2;
    position: relative;
}

#pdf-buttons {
    float: left;
}

#page-count-container {
    float: right;
}

#pdf-current-page {
    display: inline;
}

#pdf-total-pages {
    display: inline;
}

#pdf-canvas {
    border: 1px solid rgba(0,0.2);
    Box-sizing: border-Box;
}

#page-loader {
    height: 100px;
    line-height: 100px;
    text-align: center;
    display: none;
    color: #999999;
    font-size: 13px;
}

#text-layer { 
   position: absolute;
    left: 0;
    top: 0;
    right: 0;
    bottom: 0;
    overflow: hidden;
    opacity: 0.2;
    line-height: 1.0;
}

#text-layer > div {
    color: transparent;
    position: absolute;
    white-space: pre;
    cursor: text;
    transform-origin: 0% 0%;
}
<!DOCTYPE html>
<html>
<head>
<Meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<Meta name="viewport" content="width=device-width,initial-scale = 1.0,maximum-scale = 1.0,user-scalable=no">
<script src="https://ajax.googleapis.com/ajax/libs/jquery/2.2.4/jquery.min.js"></script>
<script src="https://intaxing.in/js/pdf.js"></script>
<script src="https://intaxing.in/js/pdf.worker.js"></script>

</head>

<body>

<button id="upload-button">Select PDF</button> 
<input type="file" id="file-to-upload" accept="application/pdf" />

<div id="pdf-main-container">
    <div id="pdf-loader">Loading document ...</div>
    <div id="pdf-contents">
        <div id="pdf-Meta">
            <div id="pdf-buttons">
                <button id="pdf-prev">PrevIoUs</button>
                <button id="pdf-next">Next</button>
            </div>
            <div id="page-count-container">Page <div id="pdf-current-page"></div> of <div id="pdf-total-pages"></div></div>
        </div>
        <canvas id="pdf-canvas" width="400"></canvas>
        <div id="text-layer"></div>
        <div id="page-loader">Loading page ...</div>
    </div>
</div>

</body>
</html>

解决方法

我只是使用showpage()并重新加载最后一页两次。

if(__o < __TOTAL_PAGES ){
    showPage(++__CURRENT_PAGE);
    }
else  if(__o == __TOTAL_PAGES){
showPage(__TOTAL_PAGES);

    }
else{
    var _y= _x.replace(/(style="([^>]+)")/gi,"");
    $("#text-layer,#pdf-canvas").html(" ");
    $("#upload-button").show();
    $("#pdf-main-container").show();
    console.log(_y);
    }

因此最后一页重新加载两次。这段代码只是添加了代码,所以现在我将日志_y

中的每个页面都获取了