在 Python 中以空格作为分隔符，将文本文件中的数据保存为 CSV 文件

问题描述

我的报告文件很大。我从中提取了所需的数据，保存到名为“new.txt”的新文件中。我想把这些数据保存到一个 CSV 文件中，使表头中定义的每一列及其各行数据都能正确对应。

我无法以excel的形式获取它。我正在使用python 2.7，并且希望不使用pandas软件包来做到这一点。

SIMPLE_FILE REPORT:

CALL            Alias           Severity   File                                                                                                                                                                 Line   Wt   Message
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
ACT_99          ACT_99          Warning   /application/XX/VV/2019_2_1/VV/2019.2/data/ip/xm/xm_123/ldm/xm_123.png                                                                                                785    1000 message for this block ( syn_ff ) is ignored by syn
ACT_99          ACT_99          Warning   /application/XX/VV/2019_2_1/VV/2019.2/data/ip/xm/xm_123/ldm/xm_123.png                                                                                                1111   1000 message for this block ( syn_ff ) is ignored by syn
ACT_99          ACT_99          Warning   /application/XX/VV/2019_2_1/VV/2019.2/data/ip/xm/xm_123/ldm/xm_123.png                                                                                                1226   1000 message for this block ( syn_ff ) is ignored by syn
ACT_99          ACT_99          Warning   /application/XX/VV/2019_2_1/VV/2019.2/data/ip/xm/xm_123/ldm/xm_123.png                                                                                                1354   1000 message for this block ( syn_ff ) is ignored by syn
ACT_99          ACT_99          Warning   /application/XX/VV/2019_2_1/VV/2019.2/data/ip/xm/xm_123/ldm/xm_123.png                                                                                                1363   1000 message for this block ( syn_ff ) is ignored by syn

这是我的代码。

import os
import sys
from os import path
import struct

# Scratch file that receives the report section extracted from the input.
outFile = "new.txt"
# Opened for append/read and emptied right away so each run starts clean.
# The handle stays open at module level; nothing visible here closes it.
new_file = open(outFile,"a+")
new_file.truncate(0)
# Destination CSV written by open_file().
csv_file = "report.csv"


# Both values below are filled in by calclens() from the header row.
reqlen = 0  # minimum length of a row; shorter rows are space-padded to it
fs = None  # struct.Struct describing the fixed column widths
def calclens(line):
    """Derive the fixed-width column layout from the header row.

    A column starts at offset 0 and at every non-space character that
    directly follows a space.  Each column runs up to the start of the next
    one; the last column runs to the end of ``line`` (line terminator
    included), so the widths always add up to ``len(line)``.

    Nothing is returned; the result is published through module globals:
    ``fs`` becomes a ``struct.Struct`` with one ``<width>s`` field per
    column and ``reqlen`` becomes ``len(line)``, the number of bytes ``fs``
    consumes.

    Args:
        line: Header row, with or without trailing padding and with or
            without a line terminator.
    """
    global fs, reqlen
    total = len(line)
    # Detect columns on the visible text only, so the line terminator is
    # never mistaken for the start of an extra column and a header without
    # trailing padding keeps its last column.
    header = line.rstrip('\r\n')
    starts = [0]
    for pos in range(1, len(header)):
        if header[pos - 1] == ' ' and header[pos] != ' ':
            starts.append(pos)
    # Width = distance to the next column start; the final column absorbs
    # everything that is left, including the line terminator.
    ends = starts[1:] + [total]
    fieldwidths = [end - start for start, end in zip(starts, ends)]
    # e.g. "16s 16s 10s 166s 7s 5s 52s"; struct ignores the spaces.
    fmtstring = ' '.join('{}s'.format(fw) for fw in fieldwidths)
    fs = struct.Struct(fmtstring)
    reqlen = total
   
   
def open_file(filename):
    """Extract the report section from ``filename`` and convert it to CSV.

    Every line from the ``MORESIMPLE_FILESORT REPORT:`` marker onwards is
    copied (stripped) to the scratch file behind ``new_file``.  That file is
    then re-read, split into fixed-width columns using the layout that
    ``calclens`` derives from the header row, and written to ``csv_file``
    as comma-separated rows.

    Any exception is printed and the process exits with status 1; on
    success the function simply returns.

    Args:
        filename: Path of the full report to read.
    """
    try:
        with open(filename, 'r') as f1:
            contents = [line.strip() for line in f1]
        # Raises ValueError (handled below) when the marker is missing.
        counter = contents.index("MORESIMPLE_FILESORT REPORT:")
        for item in contents[counter:]:
            new_file.write(item + "\n")
        # new_file is buffered and stays open, so push the data to disk
        # before the same file is opened again for reading.
        new_file.flush()

        with open("new.txt") as src:
            with open(csv_file, 'w') as dst:
                for i, line in enumerate(src):
                    if i == 0:
                        continue  # report title line
                    if line[0] == '+':
                        continue  # "++++" ruler line
                    if i == 1:
                        calclens(line)  # header row: sets fs and reqlen
                    # The struct needs reqlen bytes; pad short rows.
                    if len(line) < reqlen:
                        line += ' ' * (reqlen - len(line))
                    # struct works on bytes; encode/decode keeps this
                    # working on Python 2.7 and 3.
                    # NOTE(review): assumes the report is ASCII text.
                    raw = fs.unpack_from(line.encode())
                    fields = [col.decode().strip() for col in raw]
                    dst.write(','.join(fields) + '\n')
    except Exception as e:
        print(str(e))
        # Only a failure should produce a non-zero exit status.
        sys.exit(1)


if __name__ == '__main__':
    # The report path is the first command-line argument; fail with a
    # usage message instead of an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: {} REPORT_FILE".format(sys.argv[0]))
    filename = sys.argv[1]
    open_file(filename)

解决方法

要拆分固定宽度的文本文件，可以使用 struct 模块。

下面的代码使用标题行来确定列的位置和宽度，因此标题行的长度必须与数据行的长度相同，并且标题必须正确对齐。

此代码针对 Python 2.7。对于 Python 3，所需的改动已在代码注释中注明。

import struct

ss = '''
SIMPLE_FILE REPORT:
CALL            Alias           Severity  File                                                                                                                                                                  Line   Wt   Message                                            
+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
ACT_99          ACT_99          Warning   /application/XX/VV/2019_2_1/VV/2019.2/data/ip/xm/xm_123/ldm/xm_123.png                                                                                                785    1000 message for this block ( syn_ff ) is ignored by syn
ACT_99          ACT_99          Warning   /application/XX/VV/2019_2_1/VV/2019.2/data/ip/xm/xm_123/ldm/xm_123.png                                                                                                1111   1000 message for this block ( syn_ff ) is ignored by syn
ACT_99          ACT_99          Warning   /application/XX/VV/2019_2_1/VV/2019.2/data/ip/xm/xm_123/ldm/xm_123.png                                                                                                1226   1000 message for this block ( syn_ff ) is ignored by syn
ACT_99          ACT_99          Warning   /application/XX/VV/2019_2_1/VV/2019.2/data/ip/xm/xm_123/ldm/xm_123.png                                                                                                1354   1000 message for this block ( syn_ff ) is ignored by syn
ACT_99          ACT_99          Warning   /application/XX/VV/2019_2_1/VV/2019.2/data/ip/xm/xm_123/ldm/xm_123.png                                                                                                1363   1000 message for this block ( syn_ff ) is ignored by syn
'''.strip()

# Put the sample report on disk so the main script has a file to parse.
with open('data.txt', 'w') as sample:
    sample.write(ss)

##################### Main Script #######################

# Module-level state shared between calclens() and the main loop below.
reqlen = 0  # minimum length of a row; shorter rows are padded to this
fs = None  # struct.Struct for the column layout, built by calclens()

def getmaxlinelen(filename):
    """Return the length of the longest line in ``filename``.

    Lengths include the line terminator as read in text mode.  An empty
    file yields 0.
    """
    longest = 0
    with open(filename) as handle:
        for text in handle:
            longest = max(longest, len(text))
    return longest


def calclens(line, mxlen):
    """Derive the fixed-width column layout from the header row.

    A column starts at offset 0 and at every non-space character that
    directly follows a space.  Each column runs up to the start of the next
    one; the last column is stretched so the widths add up to the longest
    line in the file, which lets every (padded) data row be unpacked with
    the same layout.

    Nothing is returned; the result is published through module globals:
    ``fs`` becomes a ``struct.Struct`` with one ``<width>s`` field per
    column and ``reqlen`` becomes the number of bytes ``fs`` consumes.

    Args:
        line: Header row, with or without trailing padding and with or
            without a line terminator.
        mxlen: Length of the longest line in the file.  A value shorter
            than the header itself is raised to ``len(line)`` so no column
            can end up with a negative width.
    """
    global fs, reqlen
    total = max(mxlen, len(line))
    # Detect columns on the visible text only, so the line terminator is
    # never mistaken for the start of an extra column and a header without
    # trailing padding keeps its last column.
    header = line.rstrip('\r\n')
    starts = [0]
    for pos in range(1, len(header)):
        if header[pos - 1] == ' ' and header[pos] != ' ':
            starts.append(pos)
    # Width = distance to the next column start; the final column absorbs
    # everything up to the longest line length.
    ends = starts[1:] + [total]
    fieldwidths = [end - start for start, end in zip(starts, ends)]
    # e.g. "16s 16s 10s 166s 7s 5s 52s"; struct ignores the spaces.
    fmtstring = ' '.join('{}s'.format(fw) for fw in fieldwidths)
    fs = struct.Struct(fmtstring)
    reqlen = total

# Longest line in the file; every row is padded to this length before it is
# unpacked.
mxlen = getmaxlinelen('data.txt')

with open('data.txt') as src, open('data.csv', 'w') as dst:
    for lineno, row in enumerate(src.readlines()):
        # Line 0 is the "SIMPLE_FILE REPORT:" title; '+' lines are rulers.
        if lineno == 0 or row[0] == '+':
            continue
        if lineno == 1:
            # Header row: derive the column layout (sets fs and reqlen).
            calclens(row, mxlen)
        # The struct consumes reqlen bytes, so pad short rows with spaces.
        row = row.ljust(reqlen)
        cells = fs.unpack_from(str.encode(row))  # python 2
        #cells = tuple(s.decode() for s in fs.unpack(row.encode()))  # python 3
        # Trim the padding and drop embedded commas so a cell cannot be
        # split into two CSV columns.
        cells = [cell.strip().replace(',', '') for cell in cells]
        dst.write(','.join(cells) + '\n')

输出（data.csv）

CALL,Alias,Severity,File,Line,Wt,Message
ACT_99,ACT_99,Warning,/application/XX/VV/2019_2_1/VV/2019.2/data/ip/xm/xm_123/ldm/xm_123.png,785,1000,message for this block ( syn_ff ) is ignored by syn
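ACT_99,ACT_99,Warning,/application/XX/VV/2019_2_1/VV/2019.2/data/ip/xm/xm_123/ldm/xm_123.png,1111,1000,message for this block ( syn_ff ) is ignored by syn
ACT_99,ACT_99,Warning,/application/XX/VV/2019_2_1/VV/2019.2/data/ip/xm/xm_123/ldm/xm_123.png,1226,1000,message for this block ( syn_ff ) is ignored by syn
ACT_99,ACT_99,Warning,/application/XX/VV/2019_2_1/VV/2019.2/data/ip/xm/xm_123/ldm/xm_123.png,1354,1000,message for this block ( syn_ff ) is ignored by syn
ACT_99,ACT_99,Warning,/application/XX/VV/2019_2_1/VV/2019.2/data/ip/xm/xm_123/ldm/xm_123.png,1363,1000,message for this block ( syn_ff ) is ignored by syn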

csv csv file file file python-2.7 text-files