问题描述
我有一个C库,它从文件中读取二进制数据,将其转换并将所有内容存储在大char *中,以将数据返回给调用它的任何对象。在C中可以正常工作,但是使用python / Cython时我遇到了分配内存的问题。
库原型为:
int readWrapper(struct options opt,char *lineOut);
我的pyx文件有:
from libc.string cimport strcpy,memset
from libc.stdlib cimport malloc,free
from libc.stdio cimport printf
# Declarations taken from the C header "reader.h".
cdef extern from "reader.h":
    # Option struct handed to readWrapper by value.
    struct options:
        int debug;
        char *filename;
    # Declares a name `opt` of type `options` (presumably a variable exported
    # by the header — confirm; it is not used in the code shown here).
    options opt
    # C entry point: writes its text output into the caller-supplied buffer
    # `lineOut` and returns an int status code.
    int readWrapper(options opt,char *lineOut);
def pyreader(file,date,debug=0):
    """Call the C ``readWrapper`` with a fixed-size buffer and parse its output.

    The text written by ``readWrapper`` is decoded as UTF-8 and handed to
    ``pd.read_csv``; the resulting DataFrame is returned.

    NOTE(review): ``date`` is accepted but not used in this listing, and
    ``pd`` is not imported anywhere in the visible code.
    """
    import logging
    cdef options options
    # Get the filename
    # NOTE(review): this only allocates len(file) bytes; nothing visible here
    # copies the file name into the buffer, there is no room for a terminating
    # NUL, and the allocation is never freed.
    options.filename = <char *>malloc(len(file) * sizeof(char))
    options.debug = debug
    # Size of array
    outSize = 50000
    cdef char *line_output = <char *> malloc(outSize * sizeof(char))
    memset(line_output,1,outSize)
    # NOTE(review): index outSize is one past the end of the outSize-byte
    # allocation above.
    line_output[outSize] = 0
    # Call reader
    # The buffer size is not passed along, so readWrapper cannot know the
    # 50000-byte limit.
    return_val = readWrapper(options,line_output)
    # Create dataframe
    from io import StringIO
    data = StringIO(line_output.decode('UTF-8','strict'))
    df = pd.read_csv(data,delim_whitespace=True,header=None)
    # Free memory
    free(line_output)
    return df
只要line_output保持在outSize
的范围内,此方法就可以正常工作。但是有些文件较大,那么如何动态地执行此操作?
根据DavidW的建议进行编辑
readWrapper(读取包装函数)大致如下:
/*
 * Read the file named by opt.filename through reader() and return everything
 * it produced as one NUL-terminated, heap-allocated string.
 *
 * opt      options passed through to reader(); opt.filename is the input file.
 * lineOut  receives the malloc'ed result on success and is set to NULL on
 *          failure. The caller owns the buffer and must free() it.
 *
 * Returns EXIT_SUCCESS or EXIT_FAILURE.
 */
int readWrapper(struct options opt,char **lineOut)
{
    const size_t memIncrease = 10000;
    size_t capacity = 5000;
    size_t numWritten = 0;
    char line[255];
    char *outLine = NULL;
    FILE *fp = NULL;

    // Make sure the caller never sees an indeterminate pointer on failure
    *lineOut = NULL;

    // Open file for reading
    fp = fopen(opt.filename,"r");
    // Check for valid fp
    if (fp == NULL)
    {
        printf("file pointer is null,aborting\n");
        return (EXIT_FAILURE);
    }

    // Allocate memory
    outLine = malloc(capacity * sizeof *outLine);
    if (outLine == NULL)
    {
        fprintf(stderr,"Memory allocation failed!");
        fclose(fp);
        return (EXIT_FAILURE);
    }
    // Keep the result a valid (empty) string even if nothing is read
    outLine[0] = '\0';

    // feof is a function: it has to be called on fp, not compared with it.
    // Stop on read errors as well, otherwise the loop could spin forever.
    while (!feof(fp) && !ferror(fp))
    {
        // Start from an empty line so that a reader() call which produces
        // nothing (e.g. at end of file) does not re-append the previous line.
        // NOTE(review): assumes reader() writes a NUL-terminated string of at
        // most 254 characters into line - confirm against reader().
        line[0] = '\0';

        // Read part of file
        reader(fp,opt,line);
        size_t num2Write = strlen(line);

        if (capacity < (numWritten + num2Write + 1))
        { // Won't fit so enlarge outLine
            // One increment is always enough: memIncrease exceeds sizeof line
            capacity += memIncrease;
            // Use a temporary so the old block is not leaked on failure
            char *grown = realloc(outLine,(sizeof *outLine * capacity));
            if (grown == NULL)
            {
                fprintf(stderr,"Memory re-allocation failed!");
                free(outLine);
                fclose(fp);
                return (EXIT_FAILURE);
            }
            outLine = grown;
        }

        // Append on every iteration, not only when the buffer was enlarged;
        // the +1 copies the terminating NUL as well.
        memcpy(outLine + numWritten,line,num2Write + 1);
        numWritten += num2Write;
    } // data block loop

    *lineOut = outLine;
    fclose(fp);
    return (EXIT_SUCCESS);
}
新的pyx文件:
# NOTE(review): the next line looks garbled - an import list and the
# `def pyreader(...)` header appear to have been fused and the lines between
# them lost. Presumably the preamble matches the first listing, with
# readWrapper now declared as taking `char **lineOut` - confirm against the
# original post.
from libc.string cimport strcpy,debug=0):
    import logging
    cdef options options
    # Get the filename
    # NOTE(review): the buffer is allocated (and freed below) but nothing
    # visible here copies the file name into it.
    options.filename = <char *>malloc(len(file) * sizeof(char))
    options.debug = debug
    # Starts out NULL; readWrapper stores the address of the buffer it
    # allocates through the pointer passed below.
    cdef char *line_output = NULL
    # Call reader
    return_val = readWrapper(options,&line_output)
    # Create dataframe
    from io import StringIO
    data = StringIO(line_output.decode('UTF-8','strict'))
    # NOTE(review): `pd` is not imported in the visible part of this listing.
    df = pd.read_csv(data,header=None)
    # Free memory
    # line_output was allocated on the C side; it is released here.
    free(line_output)
    free(options.filename)
    return df
这现在很好用,但是在包装器(C)和python(pyx)部分中使用任何printf
或fprintf(stdout,...)
语句都会导致
Exception ignored in: <_io.TextIOWrapper name='<stdout>' mode='w' encoding='utf-8'>
BrokenPipeError: [Errno 32] Broken pipe
上述错误出现在使用 python3 test.py | head
运行时。如果不通过管道接 head,则不会显示错误。
最后,关于文件名及其分配的建议对我也不起作用。使用options.filename = file
会在运行时生成TypeError: expected bytes,str found
,但会编译。有趣的是,这仅在我运行如下这样调用包装程序的python代码时发生:
python3 test.py | head
。不使用管道和 head 时,则不会出现 BrokenPipeError。因此,这不是什么大问题,但我想了解是什么原因造成的。
编辑
此 BrokenPipeError 问题在使用 head 时出现,而使用 tail 时不会出现。有关此“错误”的说明,请参见:https://stackoverflow.com/a/30091579/2885280
解决方案pyx文件:
与上述readWrapper.c一起使用的最终reader.pyx文件。内存分配由C处理,清理(最后)由pyx代码处理。
from libc.stdlib cimport free
# Declarations taken from the C header "reader.h".
cdef extern from "reader.h":
    # Option struct handed to readWrapper by value.
    struct options:
        int debug;
        char *filename;
        char *DAY;
    # Declares a name `opt` of type `options` (presumably a variable exported
    # by the header — confirm; it is not used in the code shown here).
    options opt
    # C entry point: stores the address of the buffer it allocates in
    # *lineOut and returns an int status code; the caller frees the buffer.
    int readWrapper(options opt,char **lineOut);
def pyreader(file,date,debug=0):
    """Read *file* through the C ``readWrapper`` and return a DataFrame.

    Args:
        file: Path of the input file (``str``, ``bytes`` or path-like).
        date: Day string handed to the C code as ``options.DAY``; must be
            ASCII-encodable.
        debug: Values above 0 enable DEBUG logging; the value is also passed
            to the C code as ``options.debug``.

    Returns:
        The parsed ``pandas.DataFrame``. An empty DataFrame is returned when
        the inputs are invalid, the C reader fails, or the output cannot be
        parsed; the reason is logged instead of raised.
    """
    import logging
    import os
    import sys
    from io import StringIO
    import pandas as pd

    cdef options opts
    cdef char *line_output = NULL

    # Init return value
    a = pd.DataFrame()

    # logging
    logging.basicConfig(stream=sys.stdout,format='%(asctime)s:%(process)d:%(filename)s:%(lineno)s:pyreader: %(message)s',datefmt='%Y%m%d_%H.%M.%S',level=logging.DEBUG if debug > 0 else logging.INFO)
    try:
        # Check inputs
        if file is None:
            raise Exception("No valid filename provided")
        if date is None:
            raise Exception("No valid date provided")

        # The struct only borrows pointers into these bytes objects, so they
        # are kept in local variables that outlive the readWrapper call.
        # os.fsencode accepts str, bytes and path-like objects and also
        # handles non-ASCII file names.
        file_enc = os.fsencode(file)
        opts.filename = file_enc
        day_enc = date.encode('ascii')
        opts.DAY = day_enc
        # The struct is passed by value, so every field has to be initialised
        opts.debug = debug

        try:
            # Call reader
            return_val = readWrapper(opts,&line_output)
            if return_val > 0:
                logging.error("pyreadASTERIX2 failed with exitcode {}".format(return_val))
                return a
            if line_output == NULL:
                # Nothing was handed back, so there is nothing to decode
                logging.error("failed to load {} not enough data to construct DataFrame".format(file))
                return a

            try:
                # Decode once and reuse the text for logging and parsing
                text = line_output.decode('UTF-8','strict')
                logging.debug("return_val: {} and size: {}".format(return_val,len(text)))
                a = pd.read_csv(StringIO(text),header=None,dtype={'id':str})
                if a.empty:
                    logging.error("failed to load {} not enough data to construct DataFrame".format(file))
                    return a
                logging.debug("converted data into pd")
            except Exception:
                logging.exception("Exception occured while loading: {} into DataFrame".format(file))
                return a
        finally:
            # The buffer is allocated by the C code; release it on every exit
            # path (free(NULL) is a no-op).
            free(line_output)
            line_output = NULL

        logging.debug("Size of df: {}".format(len(a)))
        # Success,return DataFrame
        return a
    except Exception:
        logging.exception("pyreader returned with an exception:")
        return a
解决方法
您有两个基本选择:
-
了解如何提前计算尺寸。
size = calculateSize(...)  # for example, by pre-reading the file
line_output = <char*>malloc(size)
return_val = readWrapper(options,line_output)
-
让
readWrapper
负责分配内存。C 中有两种常用模式:a. 返回一个指针(也许使用
NULL
来指示错误):char* readWrapper(options opt)
b. 传入一个指向指针的指针,并在函数内通过它修改该指针
// C
int readWrapper(options opt,char** str_out) {
    // work out the length
    *str_out = malloc(length);
    // etc
}

# Cython
cdef char* line_out
return_value = readWrapper(options,&line_out)
您需要确保释放所分配的所有字符串。您仍然有options.filename
的内存泄漏。对于options.filename
,最好通过Cython获得指向file
内容的指针。只要存在file
,这就有效,因此您不需要分配
options.filename = file
只需确保options
的生存期不超过file
(即,它不会被保存下来供以后在C中的任何地方使用)。
一般
# Pair every allocation with a try/finally so that the memory is released on
# every exit path, including when the code in between raises.
something = malloc(...)
try:
    # code
finally:
    free(something)
是确保清理的良好模式。