字符串文字中新行字符的新行计数器[已更新]

问题描述

注意：这是我之前的问题，但我将其简化为所需的结构。如果仍然无法令人满意，请让我知道格式有什么问题，以供将来参考。

我有一个课堂项目，必须为TIPS创建一个词法分析器。我在统计要显示在输出左侧的行号时遇到了问题。目前，规则文件中有一个正则表达式，每当遇到\n或\r时，它都会递增line_number变量。我的问题是，它不会识别字符串文字内部的换行符。程序能够按预期显示带有换行符的字符串，但不会递增计数器。

期望的输出：

如上一段所述，我需要换行计数器来确认字符串文字中的换行符号。
我需要最后一行的编号为20（如预期输出所示），而不是17（如下面的实际输出所示）。

我做了这么远的事情

在字符串文字正则表达式的动作中添加额外的if语句，用于在字符串中查找\n或\r，然后递增行计数器。
将行计数器正则表达式放在规则文件中的不同位置，以确保不会被另一个正则表达式覆盖。

错误：我没有收到任何错误。

以下是我在此问题中引用的相关正则表达式的摘要。另外，问题的底部是实际和预期的输出。

规则文件的片段

 /* STRING LIteraL REGEX */
[']([^'\\]|\\(.|\n))*[']       { if(yyleng <= 80)
                                    {
                                        return TOK_STRINGLIT; 
                                    }
                                    else
                                    {
                                        return TOK_UNKNowN;
                                    }
                                }

 /* REGEX TO COUNT NEW LInes */
[\r\n]         { line_number++; }

预期输出

line: 16,lexeme: |'This string
has
newlines
inside of it'|,length: 43,token: 4003
line: 20,lexeme: |&|,length: 1,token: 6000
ERROR: unkNown token

实际输出

line: 16,lexeme: |'This string
has
newlines 
inside of it'|,token: 4003
line: 17,token: 6000 
ERROR: unkNown token

可复制文件

LEXER.H

//*****************************************************************************
// CSE 4713 / 6713 Project - List of tokens for TIPS
//*****************************************************************************

#ifndef LEXER_H
#define LEXER_H

// List of token codes

// Keywords
#define TOK_BEGIN    1000
#define TOK_BREAK    1001
#define TOK_CONTINUE 1002
#define TOK_DOWNTO   1003
#define TOK_ELSE     1004
#define TOK_END      1005
#define TOK_FOR      1006
#define TOK_IF       1007
#define TOK_LET      1008
#define TOK_PROGRAM  1009
#define TOK_READ     1010
#define TOK_THEN     1012
#define TOK_TO       1013
#define TOK_VAR      1014
#define TOK_WHILE    1015
#define TOK_WRITE    1016

// Datatype Specifiers
#define TOK_INTEGER  1100
#define TOK_REAL     1101

// Punctuation
#define TOK_SEMICOLON  2000
#define TOK_COLON      2001
#define TOK_OPENPAREN  2002
#define TOK_CLOSEPAREN 2003
#define TOK_OPENBRACE  2004
#define TOK_CLOSEBRACE 2005

// Operators
#define TOK_PLUS        3000
#define TOK_MINUS       3001
#define TOK_MULTIPLY    3002
#define TOK_DIVIDE      3003
#define TOK_ASSIGN      3004
#define TOK_EQUALTO     3005
#define TOK_LEsstHAN    3006
#define TOK_GREATERTHAN 3007
#define TOK_NOTEQUALTO  3008
#define TOK_MOD         3009
#define TOK_NOT         3010
#define TOK_OR          3011
#define TOK_AND         3012

// Useful abstractions
#define TOK_IDENT       4000  // identifier
#define TOK_INTLIT      4001  // integer literal
#define TOK_FLOATLIT    4002  // floating point literal
#define TOK_STRINGLIT   4003  // string literal
#define TOK_EOF         5000  // end of file
#define TOK_EOF_SL      5001  // end of file while parsing a string literal
#define TOK_UNKNowN     6000  // unkNown lexeme

#endif

DRIVER.CPP

//*****************************************************************************
// CSE 4713 / 6713 Project Part 1 - Lexical Analyzer Driver
// Fall 2020
//*****************************************************************************

#ifdef _MSC_VER
#define _CRT_SECURE_NO_WARNINGS
#endif

#include <stdio.h>
#include "lexer.h"


// Instantiate global variables
extern "C"
{
extern FILE *yyin;         // input stream
extern FILE *yyout;        // output stream
extern int   yyleng;       // length of current lexeme
extern char *yytext;       // text of current lexeme
extern int   yylex();      // the generated lexical analyzer

extern int   line_number;  // current line number of the input
}

// Do the analysis
int main( int argc,char* argv[] ) {
  int token;   // hold each token code

  // Set the input stream
  if (argc > 1) {
    printf("INFO: Using the file %s for input\n",argv[1]);
    yyin = fopen(argv[1],"r");
    if (!yyin) {
      printf("   ERROR: input file not found\n");
      return (-1);
    }
  }
  else {
    printf("INFO: Using stdin for input,use EOF to end input\n");
    printf("      Windows EOF is Ctrl+z,Linux EOF is Ctrl+d\n");
    yyin = stdin;
  }

  // Set the output stream
  yyout = stdout;
  
  // Do the lexical parsing
  token = yylex();
  while( token != TOK_EOF ) 
  {
    // What did we find?
    fprintf(yyout,"line: %d,lexeme: |%s|,length: %d,token: %d\n",line_number,yytext,yyleng,token);
    
    // Is it an error?
    if( token == TOK_UNKNowN )
      fprintf(yyout,"   ERROR: unkNown token\n");
    if( token == TOK_EOF_SL )
      fprintf(yyout,"   ERROR: end of file while in a string literal\n");
    
    // Get the next token
    token = yylex();
  }
  return 0;
}

RULES.L

/******************************************************************* 
Starting point your rules.l file for TIPS
Name: Stephanie Schisler                NetID: sas880
Course: CSE 4713                        Assignment: Part 1
Programming Environment: WSL C++
Purpose of File: Contains the rules for the project.
*******************************************************************/
%option noyywrap
%{
#include "lexer.h"

// global variable to hold current line number being read
int line_number = 1;

%}

%%

 /* Keywords */ 
BEGIN           { return TOK_BEGIN; }
BREAK           { return TOK_BREAK; }
CONTINUE        { return TOK_CONTINUE; }
DOWNTO          { return TOK_DOWNTO; }
ELSE            { return TOK_ELSE; }
END             { return TOK_END; }
FOR             { return TOK_FOR; }
IF              { return TOK_IF; }
LET             { return TOK_LET; }
PROGRAM         { return TOK_PROGRAM; }
READ            { return TOK_READ; }
THEN            { return TOK_THEN; }
TO              { return TOK_TO; }
VAR             { return TOK_VAR; }
WHILE           { return TOK_WHILE; }
WRITE           { return TOK_WRITE; }

 /* Datatype Specifiers */
INTEGER         { return TOK_INTEGER; }
REAL            { return TOK_REAL; }

 /* Punctuation */
\;           { return TOK_SEMICOLON; }
\:           { return TOK_COLON; }
\(          { return TOK_OPENPAREN; }
\)          { return TOK_CLOSEPAREN; }
\{          { return TOK_OPENBRACE; }
\}          { return TOK_CLOSEBRACE; }

 /* Operators */
\+          { return TOK_PLUS; }
-           { return TOK_MINUS; }
\*          { return TOK_MULTIPLY; }
\/          { return TOK_DIVIDE; }
\:=          { return TOK_ASSIGN; }
\=           { return TOK_EQUALTO; }
\<           { return TOK_LEsstHAN; }
\>           { return TOK_GREATERTHAN; }
\<>          { return TOK_NOTEQUALTO; }
MOD         { return TOK_MOD; }
NOT         { return TOK_NOT; }
OR          { return TOK_OR; }
AND         { return TOK_AND; }

 /* Abstractions */
[A-Z][0-9A-Z]{0,7}           { return TOK_IDENT; }       
[0-9]+                       { return TOK_INTLIT; }      
[0-9]+[.]?[0-9]+             { return TOK_FLOATLIT; }


[']([^'\\]|\\(.|\n))*[']       { if(yyleng <= 80)
                                    {
                                        return TOK_STRINGLIT; 
                                    }
                                    else
                                    {
                                        return TOK_UNKNowN;
                                    }
                                }

"\[[^"\\]|\\(.|\n)]*|'\[[^'\\]|\\(.|\n)]*            { return TOK_EOF_SL; }

 /* Count new lines */
[\r\n]         { line_number++; }

 /* Eat any whitespace */
[\t ]*

 /* Found an unkNown character */

.         { return TOK_UNKNowN; }

 /* Recognize end of file */

<<EOF>>   { return TOK_EOF; }

输入文件


ABCDEFGH
ABCDEFGHIJ
AB123 123AB A123ZZ  SUM  IFFInesS
AB_123
ab_123
123 
3219012894910
12.132 
.123
0.132
-123 -12.324
%%%%%%%
^
'This is a string'
'This string    has tabs        inside of it.'
'This string
 has 
 newlines
 inside of it'
&

正确的输出


ABCDEFGH
ABCDEFGHIJ
AB123 123AB A123ZZ  SUM  IFFInesS
AB_123
ab_123
123 
3219012894910
12.132 
.123
0.132
-123 -12.324
%%%%%%%
^
'This is a string'
'This string    has tabs        inside of it.'
'This string
 has 
 newlines
 inside of it'
&

MAKEFILE

###############################################################################
# CSE 4713 / 6713 Project Part 1 - Lexical Analyzer (flex)
#
# 'make'        build executable file
# 'make clean'  removes all intermediate (lex.yy.c and *.o) and executable files
#
# This makefile purposely avoids macros to make the rules more clear.
# For more information about makefiles:
#      http://www.cs.colby.edu/maxwell/courses/tutorials/maketutor/
#      http://www.cs.swarthmore.edu/~newhall/unixhelp/howto_makefiles.html
#      http://www.gnu.org/software/make/manual/make.html
#
###############################################################################

lex.exe: lex.yy.o driver.o
    g++ -g -o lex.exe lex.yy.o driver.o

driver.o: driver.cpp lexer.h
    g++ -g -o driver.o -c driver.cpp

lex.yy.o: lex.yy.c lexer.h
    gcc -g -o lex.yy.o -c lex.yy.c

lex.yy.c: rules.l lexer.h
    flex -o lex.yy.c rules.l

clean: 
    $(RM) *.o lex.yy.c lex.exe

解决方法

您的行计数器不会被字符串中的换行符增加，因为您对字符串模式的操作不会改变行计数器。

（F）lex词法分析器根据您提供的模式将输入划分为标记，并为每个标记执行关联的操作。模式在其他模式中不匹配：这会导致混乱。（例如，tiffany不包含if令牌。它是一个不可分割的标识符。）

从使用flex构建的词法分析器中获取准确的行数的最简单方法是包括该选项

 %option yylineno

在您的序言中（flex输入文件中第一个%%之前的部分）。完成后，flex将为您做所有事情，yylineno将始终包含行号计数。（它包含令牌的 end 处的行号计数，这一点很重要：如果您想知道多行令牌从哪一行开始，则需要多做一点。）

您可能已被告知不要使用yylineno。（就个人而言，我认为这样的作业限制是错误的，但我并不总是与讲师们意见一致。）如果是这种情况，您将需要手动完成flex本来会为您做的事情：重新扫描任何可能包含换行符的令牌，以计算其中包含的换行符数量（如果有）：

[']([^'\\]|\\(.|\n))*[']       { for (const char* p = yytext; *p; ++p) {
                                   if (*p == '\n') ++line_number;
                                 }
                                 /* Rest of the string action */ 
                                 ...

您可以使用开始条件来处理字符串文字的内部结构，从而使重新扫描更加高效（也适用于flex生成的词法分析器的重新扫描，因为它将添加代码以完全执行上述操作）。例如，您可以将该代码更改为：

%x STRING_LITERAL
%%
[']                  { yymore(); BEGIN(STRING_LITERAL); }
<STRING_LITERAL>{
  [^'\\\n]+          { yymore(); }
  \\?\r?\n           { ++line_number; yymore(); }
  \\.                { yymore(); }
  [']                { BEGIN(INITIAL); /* Return to the normal scan */
                       /* At this point,yytext and yyleng refer to the
                        * entire token,including the ' marks. So you can
                        * now do exactly what you did in your string literal
                        * action. But see below for some comments.
                        */
                        ...
                      }
}
   /* Other lexical rules continue here */

看看flex manual section on start conditions，详细了解启动条件如何工作。

请注意，除了最后一个实际接受令牌的操作之外，字符串令牌内的所有操作均包含对yymore()的调用。 yymore()是一个特殊的（f）lex动作，它告诉分析器该令牌尚未完成，并且下一个模式将与当前令牌的另一部分匹配。

关于代码的一些说明：

您的行尾模式[\r\n]与\r或\n匹配。Windows的行尾实际上是两个字符的序列\r\n，因此如果遇到这样的行尾，您最终会将line_number递增两次。（但是，除非您以二进制模式打开输入文件，否则您不太可能遇到它，因为标准库在读取行时应删除\r。）匹配Windows或Unix行尾序列的正确模式是
```
\r?\n
```
，它将匹配整个行尾。有时您会看到更复杂的模式；如果您的讲师在意那些早已不再使用的历史遗留惯例，他们可能会要求您处理本世纪已无人使用的约定，例如OS X之前的Apple系统仅使用\r的约定。我的建议是不必理会。只需计算\n，无论它们之前是否有\r。在您可能遇到的任何系统上，这都是正确的。
您对过长字符串的测试是不精确的（我认为我们在另一个问题中提到过这一点）。首先，它把引号也计入字符串文字的长度，这很可能是不正确的。其次，它按转义序列本身的长度来计数，而不是按转义序列最终转换成的单个字符来计数。这将导致a被计为一个字符，而语义上等效的\x61却被计为四个字符，从而可能使完全有效的、少于80个字符的文字被您的词法扫描器拒绝。如果使用上面建议的开始条件方案来计算行数，您还可以通过单独统计多余的字符来修正文字的计算长度。（您不能在令牌内部更改yyleng，因为yymore()依赖于能够维护yytext和yyleng。请参见上面的手册链接。）

c++flex-lexer lexical-analysis regex regex regex