awk:在匹配特定字符串和标头之后,对分组行之间的值求和

问题描述

我用awk编写了如下程序:

# Input fields are separated by ">" or ";"; print joins its arguments with ";".
BEGIN {
  FS = "[>;]"
  OFS = ";"
}

# p(a): for every key of array a, print ">" followed by the key, then OFS,
# then "*nr=" followed by the global line counter ln.
# key is a local scratch variable; callers pass only the array.
function p(a,    key)
{
  for (key in a)
    print ">" key, "*nr=" ln
}

# Header line (starts with ">"): report what was collected so far, then
# reset the line counter and empty the key set before skipping the other rules.
/^>/ {
  p(out)
  ln = 0
  split("", out)
  next
}

# Line containing "*": record fields 2 and 3, joined by OFS, as a key of out.
/[*]/ {
  out[$2 OFS $3]
}

# Every line that is not a header is counted.
{
  ln += 1
}

# Report the final group, provided at least one line was counted for it.
END {
  if (ln)
    p(out)
}

它适用于这样的文件

>Cluster 300
0   151nt,>last238708;size=1... *
>Cluster 301
0   141nt,>last103379;size=1... at -/99.29%
1   151nt,>last104482;size=1... *
>Cluster 302
0   151nt,>last104505;size=1... *
>Cluster 303
0   119nt,>last325860;size=1... at +/99.16%
1   122nt,>last106751;size=1... at +/99.18%
2   151nt,>last284418;size=1... *
3   113nt,>last8067;size=3... at -/100.00%
4   122nt,>last8102;size=3... at -/100.00%
5   135nt,>last14200;size=2... at +/99.26%
>Cluster 304
0   151nt,>last285146;size=1... *

我需要的是:对每个群集,程序打印带星号的那一行的id(lastxxxxxx),并计算该群集内所有“size=”数值的总和。例如,对于群集303,它应输出以下内容:

>last284418;nr=11

对于群集304:

>last285146;nr=1

目前,我的代码只能对行数进行计数求和,而没有考虑“size=”的值。感谢您的帮助!

解决方法

请尝试以下代码;它仅针对所示示例在GNU awk中编写并测试。

awk '
# Print the stored cluster header and, on the following line, "<id> = <total>".
# Nothing is printed while the running total is empty or zero.
function report() {
  if (sum)
    print clus_line ORS val_line " = " sum
}

# Cluster header: report the previous cluster, then start a fresh one.
/^>Cluster [0-9]+/ {
  report()
  sum = ""
  val_line = ""
  clus_line = $0
  next
}

# Member line: add the digits that follow "size=" to the running total.
# RSTART + 5 and RLENGTH - 5 step over the five characters of "size=".
{
  if (match($0, /size=[0-9]+/))
    sum += substr($0, RSTART + 5, RLENGTH - 5)
}

# Line ending in "*": keep its ">last..." id, without the leading ">".
/\*$/ {
  match($0, />last[^;]*/)
  val_line = substr($0, RSTART + 1, RLENGTH - 1)
}

# Report the final cluster.
END {
  report()
}'  Input_file

说明:下面为上述代码逐行添加了详细注释。

## Per cluster: print the header line, then the id of the member line ending in * and the sum of all size= values.
awk '                                          ##Starting awk program from here.
/^>Cluster [0-9]+/{                            ##If the line starts with >Cluster followed by digits, do the following.
  if(sum){                                     ##If sum is non-empty and non-zero, a previous cluster is pending.
    print clus_line ORS val_line" = "sum       ##Print clus_line, ORS (newline), val_line, " = " and sum.
  }
  val_line=sum=clus_line=""                    ##Reset val_line, sum and clus_line.
  clus_line=$0                                 ##Store the current header line in clus_line.
  next                                         ##Skip the remaining rules for this line.
}
{
  match($0,/size=[0-9]+/)                      ##Locate size= followed by digits; this sets RSTART and RLENGTH.
  line=substr($0,RSTART,RLENGTH)               ##Copy the matched text (RLENGTH characters starting at RSTART) into line.
  sub(/.*size=/,"",line)                       ##Delete everything up to and including size= from line, leaving only the digits.
  sum+=line                                    ##Add the number held in line to sum.
}
/\*$/{                                         ##If the line ends with *, do the following.
  match($0,/>last[^;]*/)                       ##Locate >last and everything up to, but not including, the next semicolon.
  val_line=substr($0,RSTART+1,RLENGTH-1)       ##Copy the match without its leading > (start at RSTART+1, length RLENGTH-1) into val_line.
}
END{                                           ##END block: runs after the last input line has been read.
  if(sum){                                     ##If sum is non-empty and non-zero, the last cluster is still pending.
    print clus_line ORS val_line" = "sum       ##Print clus_line, ORS (newline), val_line, " = " and sum.
  }
}'  Input_file                                 ##Mentioning Input_file name here.