在html中映射并搜索文本的位置

问题描述

我有文字,而此文字带有html

很久以前,有一个非常有钱的人和他的三个女儿住在一起。两个大女儿嘲笑任何穿得不如她们好的人。如果她们两个不在家休息,就会出去购物,把拿得动的精美衣服和帽子尽可能多地买回家。

<span>
    <div style="font-family:Calibri,Helvetica,sans-serif; font-size:12pt; color:rgb(0,0)">
        <p class="p1" style="margin: 0px; font: 17px; font-family: Helvetica Neue"><b>Once upon a time there was
                a
                very rich man who <span style="color: blue">lived</span> with his three daughters.<span class="Apple-converted-space">&nbsp;
                </span>The two older daughters laughed at anyone who di<span style="color: orange">d n</span>ot dress <span style="color: green">as</span> wel as they did.<span
                    class="Apple-converted-space">&nbsp; </span>If the two of them were not resting at home,they were out shopping for as many fine dresses and hats as they could <span style="color: red">carry</span> home. <span
                    class="Apple-converted-space">&nbsp;</span></b></p><br>
    </div>
    <div style="font-family:Calibri,0)">
    </div>
</span>

需要一个通用的解决方案,用来在html中查找纯文本里某个单词/短语所在的位置。问题在于,单词/短语内部可能夹杂着样式标签,例如:

di<span style="color: orange">d n</span>ot

曾尝试使用Levenshtein距离来跟踪位置偏移,但这是一个非常“笨重”的解决方案

解决方法

// Demo driver: take the markup of the #input element and log where the phrase sits inside it.
const { innerHTML: html } = document.getElementById('input');
const word = 'did not';

console.log(searchPositions(html, word));

/**
 * Locate a plain-text phrase inside an HTML string whose markup may split the phrase.
 *
 * Tags are stripped with a regular expression, the phrase is searched for in the
 * remaining text, and the match is mapped back to offsets in the original string.
 * Character references such as `&nbsp;` are NOT decoded; they are searched as the
 * literal characters that appear in `html`.
 *
 * @param {string} html - HTML source to search in.
 * @param {string} issueText - Phrase to find, written as it reads once tags are removed.
 * @returns {{start: number, end: number}} Offsets (UTF-16 code units, the same unit as
 *   `String.prototype.indexOf`) in `html` of the first and the last character
 *   (inclusive) of the first occurrence. Both are -1 when the phrase is empty or
 *   does not occur.
 */
function searchPositions(html, issueText) {
    const tagPattern = /<\/?[^>]+(>|$)/g;

    // Pieces of text found between tags, and for every character of that text
    // the offset it had in the original html string.
    const textChunks = [];
    const htmlIndexOf = [];

    const keepText = (from, to) => {
        textChunks.push(html.slice(from, to));
        for (let i = from; i < to; i++) {
            htmlIndexOf.push(i);
        }
    };

    // Single pass over the tags: everything between the previous tag's end and
    // this tag's start is visible text. Offsets stay in UTF-16 units so they
    // agree with the regex match indices and with indexOf below.
    let cursor = 0;
    for (const tag of html.matchAll(tagPattern)) {
        keepText(cursor, tag.index);
        cursor = tag.index + tag[0].length;
    }
    keepText(cursor, html.length);

    const textTrue = textChunks.join('');

    // An empty phrase has no last character to report; treat it like a miss.
    const inTextStartPosition = issueText.length === 0 ? -1 : textTrue.indexOf(issueText);
    if (inTextStartPosition === -1) {
        return { start: -1, end: -1 };
    }
    const inTextEndPosition = inTextStartPosition + issueText.length - 1;

    return {
        start: htmlIndexOf[inTextStartPosition],
        end: htmlIndexOf[inTextEndPosition],
    };
}
<div id='input'>
        <span>
            <div style="font-family:Calibri,Helvetica,sans-serif; font-size:12pt; color:rgb(0,0)">
                <p class="p1" style="margin: 0px; font: 17px; font-family: Helvetica Neue"><b>Once upon a time there was
                        a
                        very rich man who <span style="color: blue">lived</span> with his three daughters.<span
                            class="Apple-converted-space">&nbsp;
                        </span>The two older daughters laughed at anyone who di<span style="color: orange">d n</span>ot
                        dress <span style="color: green">as</span> wel as they did.<span
                            class="Apple-converted-space">&nbsp; </span>If the two of them were not resting at home,they were out shopping for as many fine dresses and hats as they could <span
                            style="color: red">carry</span> home. <span class="Apple-converted-space">&nbsp;</span></b>
                </p><br>
            </div>
            <div style="font-family:Calibri,0)">
            </div>
        </span>
    </div>

相关问答

错误1:Request method ‘DELETE‘ not supported 错误还原:...
错误1:启动docker镜像时报错:Error response from daemon:...
错误1:private field ‘xxx‘ is never assigned 按Alt...
报错如下,通过源不能下载,最后警告pip需升级版本 Requirem...