问题描述
我的报告文件很大。我从中提取了所需的数据,保存到名为“new.txt”的新文件中。我想把这些数据保存为 CSV 文件,使文件中定义的每个表头都能正确对应到各自的列和行。
我无法以excel的形式获取它。我正在使用python 2.7,并且希望不使用pandas软件包来做到这一点。
SIMPLE_FILE REPORT:
CALL Alias Severity File Line Wt Message
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
ACT_99 ACT_99 Warning /application/XX/VV/2019_2_1/VV/2019.2/data/ip/xm/xm_123/ldm/xm_123.png 785 1000 message for this block ( syn_ff ) is ignored by syn
ACT_99 ACT_99 Warning /application/XX/VV/2019_2_1/VV/2019.2/data/ip/xm/xm_123/ldm/xm_123.png 1111 1000 message for this block ( syn_ff ) is ignored by syn
ACT_99 ACT_99 Warning /application/XX/VV/2019_2_1/VV/2019.2/data/ip/xm/xm_123/ldm/xm_123.png 1226 1000 message for this block ( syn_ff ) is ignored by syn
ACT_99 ACT_99 Warning /application/XX/VV/2019_2_1/VV/2019.2/data/ip/xm/xm_123/ldm/xm_123.png 1354 1000 message for this block ( syn_ff ) is ignored by syn
ACT_99 ACT_99 Warning /application/XX/VV/2019_2_1/VV/2019.2/data/ip/xm/xm_123/ldm/xm_123.png 1363 1000 message for this block ( syn_ff ) is ignored by syn
这是我的代码。
import os
import sys
from os import path
import struct
# Output locations: the CSV that gets generated and the extracted report text.
csv_file = "report.csv"
outFile = "new.txt"

# Start from an empty extraction file on every run; the handle is kept open
# at module level because open_file() writes to it.
new_file = open(outFile, "a+")
new_file.truncate(0)

# Parsing state, replaced by calclens() once the header row has been read.
fs = None    # struct.Struct describing one fixed-width row
reqlen = 0   # minimum length of a row
def calclens(line):
    """Derive the fixed-width row layout from the header row.

    A column starts at offset 0 and wherever a non-blank character follows a
    blank in the header text; it runs up to the start of the next column.
    The last column is extended to the end of *line* (terminator included),
    so the layout always covers ``len(line)`` characters.

    The header may or may not carry trailing blanks before its newline, and a
    single-column header is accepted.

    Side effects: sets the module globals ``fs`` (a ``struct.Struct`` with
    one ``Ns`` item per column) and ``reqlen`` (the row length the layout
    needs).
    """
    global fs, reqlen

    # Column detection only looks at the visible header text.
    header = line.rstrip('\r\n').rstrip(' ')

    starts = [0]
    for pos, (prev, cur) in enumerate(zip(header, header[1:]), 1):
        if prev == ' ' and cur != ' ':  # new column
            starts.append(pos)

    total = len(line)
    ends = starts[1:] + [total]  # last column runs to the end of the line
    fieldwidths = [end - start for start, end in zip(starts, ends)]

    # Whitespace between struct format items is ignored, e.g. "16s 16s 10s".
    fmtstring = ' '.join('{}s'.format(fw) for fw in fieldwidths)
    fs = struct.Struct(fmtstring)
    reqlen = total
def _csv_field(text):
    """Return *text* quoted for CSV if it holds a comma, quote or line break."""
    if any(ch in text for ch in ',"\r\n'):
        return '"' + text.replace('"', '""') + '"'
    return text


def open_file(filename):
    """Extract the report section of *filename* and convert it to CSV.

    Everything from the "MORESIMPLE_FILESORT REPORT:" line to the end of the
    file is written to the module-level ``new_file``. The same lines are then
    split into fixed-width fields, using the layout computed by calclens()
    from the header row (the line right after the report title), and written
    to ``csv_file`` as comma-separated rows.

    The title line, blank lines and lines starting with '+' are not emitted.
    Any failure is printed and the process exits with status 1.
    """
    try:
        with open(filename, 'r') as f1:
            contents = [line.strip() for line in f1]
        counter = contents.index("MORESIMPLE_FILESORT REPORT:")
        report = contents[counter:]

        for item in report:
            new_file.write(item + "\n")
        # new_file stays open, so push the buffered text out to disk now.
        new_file.flush()

        # Longest row of the report; the header is padded beyond it so that
        # no data row is truncated when it is unpacked.
        mxlen = max(len(item) for item in report)

        with open(csv_file, 'w') as f2:
            for i, line in enumerate(report):
                if i == 0:
                    continue  # report title line
                if not line or line[0] == '+':
                    continue  # blank line or ++++ separator line
                if i == 1:
                    # Header row. Guarantee at least one trailing blank
                    # before the newline so the last header column is
                    # delimited like the others.
                    calclens(line.ljust(mxlen + 1) + '\n')
                row = line.ljust(reqlen)
                if not isinstance(row, bytes):
                    row = row.encode()  # Python 3: struct works on bytes
                fields = fs.unpack_from(row)
                texts = [fld if isinstance(fld, str) else fld.decode()
                         for fld in fields]
                f2.write(','.join(_csv_field(t.strip()) for t in texts) + '\n')
    except Exception as e:
        # Script-level boundary: report the problem and stop.
        print(str(e))
        sys.exit(1)
if __name__ == '__main__':
    # The report path is the only command-line argument.
    if len(sys.argv) < 2:
        sys.stderr.write("usage: {0} REPORT_FILE\n".format(sys.argv[0]))
        sys.exit(2)
    filename = sys.argv[1]
    open_file(filename)
解决方法
要拆分固定宽度的文本文件,可以使用 struct 模块。
下面的代码使用标题行来确定列的位置和宽度,因此标题行的长度必须与数据行的长度相同,并且标题必须正确对齐。
此代码针对 Python 2.7;Python 3 所需的改动已在代码注释中标出。
import struct
# Sample report used as test input: a title line, the header row, a '+'
# separator line and the data rows.
# NOTE(review): the parser below expects header and data columns to be aligned
# at fixed widths - confirm the spacing inside this literal is intact.
ss = '''
SIMPLE_FILE REPORT:
CALL Alias Severity File Line Wt Message
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
ACT_99 ACT_99 Warning /application/XX/VV/2019_2_1/VV/2019.2/data/ip/xm/xm_123/ldm/xm_123.png 785 1000 message for this block ( syn_ff ) is ignored by syn
ACT_99 ACT_99 Warning /application/XX/VV/2019_2_1/VV/2019.2/data/ip/xm/xm_123/ldm/xm_123.png 1111 1000 message for this block ( syn_ff ) is ignored by syn
ACT_99 ACT_99 Warning /application/XX/VV/2019_2_1/VV/2019.2/data/ip/xm/xm_123/ldm/xm_123.png 1226 1000 message for this block ( syn_ff ) is ignored by syn
ACT_99 ACT_99 Warning /application/XX/VV/2019_2_1/VV/2019.2/data/ip/xm/xm_123/ldm/xm_123.png 1354 1000 message for this block ( syn_ff ) is ignored by syn
ACT_99 ACT_99 Warning /application/XX/VV/2019_2_1/VV/2019.2/data/ip/xm/xm_123/ldm/xm_123.png 1363 1000 message for this block ( syn_ff ) is ignored by syn
'''.strip()
# Store the sample in data.txt, the file the main script reads.
with open ('data.txt','w') as f: f.write(ss) # write test file
##################### Main Script #######################
# Shared parsing state; both values are replaced by calclens() once the
# header row has been read.
fs = None    # struct.Struct describing one fixed-width row
reqlen = 0   # minimum length of a row; shorter rows are padded to it
def getmaxlinelen(filename):
    """Return the length of the longest line in *filename*.

    Lengths are taken from the lines as read, so a trailing newline counts as
    one character. An empty file yields 0.
    """
    with open(filename) as f:
        # Iterate the file directly instead of loading it with readlines();
        # the leading 0 keeps max() valid for an empty file.
        return max([0] + [len(ln) for ln in f])
def calclens(line, mxlen):
    """Derive the fixed-width row layout from the header row.

    A column starts at offset 0 and wherever a non-blank character follows a
    blank in the header text; it runs up to the start of the next column.
    The last column is extended so that the whole layout covers *mxlen*
    characters (the length of the longest line in the file), or the header
    line itself if that is longer.

    The header may or may not carry trailing blanks before its newline, and a
    single-column header is accepted.

    Side effects: sets the module globals ``fs`` (a ``struct.Struct`` with
    one ``Ns`` item per column) and ``reqlen`` (the row length the layout
    needs).
    """
    global fs, reqlen

    # Column detection only looks at the visible header text.
    header = line.rstrip('\r\n').rstrip(' ')

    starts = [0]
    for pos, (prev, cur) in enumerate(zip(header, header[1:]), 1):
        if prev == ' ' and cur != ' ':  # new column
            starts.append(pos)

    # Never lay out fewer characters than the header line itself has.
    total = max(mxlen, len(line))
    ends = starts[1:] + [total]  # last column runs to the longest line's end
    fieldwidths = [end - start for start, end in zip(starts, ends)]

    # Whitespace between struct format items is ignored,
    # e.g. "16s 16s 10s 166s 7s 5s 52s".
    fmtstring = ' '.join('{}s'.format(fw) for fw in fieldwidths)
    fs = struct.Struct(fmtstring)
    reqlen = total
def _csv_field(text):
    """Return *text* quoted for CSV if it holds a comma, quote or line break."""
    if any(ch in text for ch in ',"\r\n'):
        return '"' + text.replace('"', '""') + '"'
    return text


# Convert the fixed-width report in data.txt into data.csv.
mxlen = getmaxlinelen('data.txt')
with open('data.txt') as f:
    with open('data.csv', 'w') as f2:
        for i, line in enumerate(f):
            if i == 0:
                continue  # SIMPLE_FILE REPORT:
            if not line.strip() or line[0] == '+':
                continue  # skip blank lines and the ++++ line
            if i == 1:
                # Header row: calc field positions/lengths. Guarantee at
                # least one trailing blank before the newline so the last
                # header column is delimited like the others.
                calclens(line.rstrip('\r\n').ljust(mxlen) + '\n', mxlen)
            row = line.ljust(reqlen)  # row length must match the layout
            if not isinstance(row, bytes):
                row = row.encode()  # Python 3: struct works on bytes
            fields = fs.unpack_from(row)
            texts = [fld if isinstance(fld, str) else fld.decode()
                     for fld in fields]
            # Trim every field; quote instead of dropping embedded commas.
            f2.write(','.join(_csv_field(t.strip()) for t in texts) + '\n')
输出(data.csv)
CALL,Alias,Severity,File,Line,Wt,Message
ACT_99,ACT_99,Warning,/application/XX/VV/2019_2_1/VV/2019.2/data/ip/xm/xm_123/ldm/xm_123.png,785,1000,message for this block ( syn_ff ) is ignored by syn
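ACT_99,ACT_99,Warning,/application/XX/VV/2019_2_1/VV/2019.2/data/ip/xm/xm_123/ldm/xm_123.png,1111,1000,message for this block ( syn_ff ) is ignored by syn
ACT_99,ACT_99,Warning,/application/XX/VV/2019_2_1/VV/2019.2/data/ip/xm/xm_123/ldm/xm_123.png,1226,1000,message for this block ( syn_ff ) is ignored by syn
ACT_99,ACT_99,Warning,/application/XX/VV/2019_2_1/VV/2019.2/data/ip/xm/xm_123/ldm/xm_123.png,1354,1000,message for this block ( syn_ff ) is ignored by syn
ACT_99,ACT_99,Warning,/application/XX/VV/2019_2_1/VV/2019.2/data/ip/xm/xm_123/ldm/xm_123.png,1363,1000,message for this block ( syn_ff ) is ignored by syn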