在JavaScript中检测文本中确切的单词位置

问题描述

我有一段文字,其中某些单词可能会重复。我必须检测每个单词的单词出现情况,例如:

var seen = new Map();
tokens.forEach(token => { // for each token
    let item = {
        "word": token
    }
    var pattern = "\\b($1)\\b";
    var wordRegex = new RegExp(pattern.replace('$1',token),"g");

    // calculate token begin end 
    var match = null;
    while ((match = wordRegex.exec(text)) !== null) {
        if (match.index > (seen.get(token) || -1)) {
            var wordStart = match.index;
            var wordEnd = wordStart + token.length - 1;
            item.characterOffsetBegin = wordStart;
            item.characterOffsetEnd = wordEnd;

            seen.set(token,wordEnd);
            break;
        }
    }
});

我已经实施了这种部分有效的方法

function aggressive_tokenizer(text) {
  // most punctuation
  text = text.replace(/([^\w\.\'\-\/\+\<\>,&])/g," $1 ");
  // commas if followed by space
  text = text.replace(/(,\s)/g," $1");
  // single quotes if followed by a space
  text = text.replace(/('\s)/g," $1");
  // single quotes if last char
  text = text.replace(/('$)/," $1");
  text = text.replace(/(\s+[`'"‘])(\w+)\b(?!\2)/g," $2")
  // periods before newline or end of string
  text = text.replace(/\. *(\n|$)/g," . ");
  // replace punct
  // ignore "-" since may be in slang scream
  text = text.replace(/[\\?\^%<>=!&|+\~]/g,"");
  text = text.replace(/[…;,.:*#\)\({}\[\]]/g,"");
  // finally split remainings into words
  text = text.split(/\s+/)
  return text;
}

var seen = new Map();
var text = "Lorem ipsum dolor sit amet,consectetur adipiscing elit,sed do eiusmod tempor incididunt ut labore et dolore magna aliqua."

var tokens = aggressive_tokenizer(text);
var indexes = tokens.map(token => { // for each token
  let item = {
    "word": token
  }
  var pattern = "\\b($1)\\b";
  var wordRegex = new RegExp(pattern.replace('$1',"g");

  // calculate token begin end 
  var match = null;
  while ((match = wordRegex.exec(text)) !== null) {
    if (match.index > (seen.get(token) || -1)) {
      var wordStart = match.index;
      var wordEnd = wordStart + token.length - 1;
      item.characterOffsetBegin = wordStart;
      item.characterOffsetEnd = wordEnd;
      seen.set(token,wordEnd);
      break;
    }
  }
  return item;
});
console.log(indexes);

这将在大多数情况下起作用,如下所示:

 var text = "'Lorem ipsum 'dolor sit amet,sed do eiusmod tempor incididunt ut labore et dolore magna aliqua."

在某些情况下,我发现索引缺失:

{
    "word": "'Lorem"
}

在这里,我在某些单词上添加了“'”:“'Lorem”和“'dolor”(这在英语中类似于诸如“'Cause'”之类的缩略语,等等。预期:

pattern = "\\b($1)\\b";

这可能是由于'Cause所致,我使用它来完全匹配单词以获取正确的开始和结束字符偏移量,而令牌生成器会将诸如'Cause之类的某些文本标记为{ {1}},因此要保持重音符号以进一步分析此令牌(就像在NLP管道中转换'cause中的because一样,因此我无法从这些令牌中删除“'”。

另一种尝试是使用正则表达式

pattern = "(?<!\\S)$1(?!\\S)";

'Lorem的情况下有效,但在其他情况下可能会失败。

解决方法

暂无找到可以解决该程序问题的有效方法,小编努力寻找整理中!

如果你已经找到好的解决方法,欢迎将解决方案带上本链接一起发送给小编。

小编邮箱:dio#foxmail.com (将#修改为@)