问题描述
我用类似下面的做法获得了 3 倍的性能提升:
def group(size=35000000, high=3298):
    """Count how often each distinct value occurs in a random integer array.

    Draws ``size`` random integers from ``[0, high)``, sorts them, and finds
    the start of every run of equal values with ``np.diff``.

    Args:
        size: Number of random integers to generate. Defaults to the
            benchmark size of 35,000,000.
        high: Exclusive upper bound for the random integers.

    Returns:
        tuple: ``(vals, count)`` -- two 1-D arrays of equal length, where
        ``vals`` holds the distinct values in ascending order and
        ``count[i]`` is the number of occurrences of ``vals[i]``.
    """
    import numpy as np

    values = np.array(np.random.randint(0, high, size=size), dtype='u4')
    values.sort()

    # dif[i] is non-zero exactly where a new run of equal values begins.
    # Slot 0 keeps its initial 1 so the first run is always detected.
    dif = np.ones(values.shape, values.dtype)
    dif[1:] = np.diff(values)

    # Use flat indices: np.where() returns a tuple, and np.diff() on that
    # tuple yields a 2-D result instead of a plain vector of run lengths.
    starts = np.flatnonzero(dif)
    vals = values[starts]

    # Append len(values) as a sentinel so the final run gets a length too;
    # without it ``count`` is one element shorter than ``vals``.
    count = np.diff(np.concatenate((starts, [len(values)])))
    return vals, count
解决方法
我有许多很大的整数列表(每个超过 35,000,000 个元素),其中包含重复项。我需要得到列表中每个整数出现的次数。下面的代码可以工作,但似乎很慢。有没有人能用 Python(最好是 NumPy)写出基准测试结果更好的实现?
def group(size=35000000, high=1 << 32):
    """Baseline run-length count of a sorted random array via ``groupby``.

    Draws ``size`` random integers from ``[0, high)``, sorts them, and
    walks the sorted array with ``itertools.groupby`` to pair every
    distinct value with its number of occurrences.

    Args:
        size: Number of random integers to generate. Defaults to the
            benchmark size of 35,000,000.
        high: Exclusive upper bound for the random integers.

    Returns:
        numpy.ndarray: Structured array with dtype ``'u4,u2'``; field
        ``f0`` is the distinct value and ``f1`` its count. Counts are
        stored as 16-bit unsigned integers, so a single value must not
        occur more than 65535 times.
    """
    import numpy as np
    from itertools import groupby

    values = np.array(np.random.randint(0, high, size=size), dtype='u4')
    values.sort()

    # len(list(g)) materializes each run just to measure it. That is the
    # slow part being benchmarked, so it is kept as written.
    groups = ((k, len(list(g))) for k, g in groupby(values))
    index = np.fromiter(groups, dtype='u4,u2')
    return index
if __name__ == '__main__':
    from timeit import Timer

    # Time one call of group(); the setup string imports it from this script.
    t = Timer("group()", "from __main__ import group")
    # Single-argument print(...) runs on both Python 2 and Python 3,
    # unlike the bare ``print x`` statement.
    print(t.timeit(number=1))
返回:
$ python bench.py
111.377498865
干杯!
**根据回复所做的编辑:**
def group_original(size=35000000, high=1 << 32):
    """Baseline ("Original") benchmark: count runs with ``groupby``.

    Draws ``size`` random integers from ``[0, high)``, sorts them, and
    pairs every distinct value with its number of occurrences by walking
    the sorted array with ``itertools.groupby``.

    NOTE(review): the body of this snippet was truncated in the source
    (only ``np.random.randint(0,`` and ``u2')`` survived). It is restored
    here from the identical question snippet ``group`` in the same
    document -- confirm against the original post.

    Args:
        size: Number of random integers to generate.
        high: Exclusive upper bound for the random integers.

    Returns:
        numpy.ndarray: Structured array with dtype ``'u4,u2'``; field
        ``f0`` is the distinct value and ``f1`` its count (16-bit, so at
        most 65535 occurrences per value).
    """
    import numpy as np
    from itertools import groupby

    values = np.array(np.random.randint(0, high, size=size), dtype='u4')
    values.sort()

    # Each run is materialized as a list only to take its length; this is
    # the slow baseline the other variants are measured against.
    groups = ((k, len(list(g))) for k, g in groupby(values))
    index = np.fromiter(groups, dtype='u4,u2')
    return index
def group_gnibbler(size=35000000, high=1 << 32):
    """"Gnibbler" benchmark: count runs with ``groupby`` without lists.

    Same approach as the baseline, except that each run's length is
    obtained with ``sum(1 for ...)`` instead of building a temporary list.

    NOTE(review): parts of this snippet were truncated in the source; the
    surviving fragments (``sum(1 for i in g)) for k,`` and ``u2')``) were
    completed by analogy with the baseline snippet -- confirm against the
    original post.

    Args:
        size: Number of random integers to generate.
        high: Exclusive upper bound for the random integers.

    Returns:
        numpy.ndarray: Structured array with dtype ``'u4,u2'``; field
        ``f0`` is the distinct value and ``f1`` its count (16-bit, so at
        most 65535 occurrences per value).
    """
    import numpy as np
    from itertools import groupby

    values = np.array(np.random.randint(0, high, size=size), dtype='u4')
    values.sort()

    # Counting with a generator avoids allocating a list for every run.
    groups = ((k, sum(1 for _ in g)) for k, g in groupby(values))
    index = np.fromiter(groups, dtype='u4,u2')
    return index
def group_christophe(size=35000000, high=1 << 32):
    """"Christophe" benchmark: per-element counts via ``searchsorted``.

    Draws ``size`` random integers from ``[0, high)``, sorts them, and for
    every element subtracts its left insertion point from its right
    insertion point, which gives how often that element's value occurs.

    NOTE(review): two lines of this snippet were truncated in the source
    (the ``randint`` arguments and the ``dtype=`` of ``np.zeros``); they
    are restored by analogy with the sibling snippets -- confirm against
    the original post.

    Args:
        size: Number of random integers to generate.
        high: Exclusive upper bound for the random integers.

    Returns:
        numpy.ndarray: Structured array with dtype ``'u4,u2'`` and
        ``len(values)`` rows -- one row per *element*, not per distinct
        value (see the comment at the end of the body).
    """
    import numpy as np

    values = np.array(np.random.randint(0, high, size=size), dtype='u4')
    values.sort()

    # right - left insertion index == multiplicity of each element's value.
    right = values.searchsorted(values, side='right')
    left = values.searchsorted(values, side='left')
    counts = right - left

    index = np.zeros(len(values), dtype='u4,u2')
    index['f0'] = values
    index['f1'] = counts
    # Erroneous result! The source flags this output as wrong. The array
    # has one row per element, so a value occurring k times appears in k
    # identical rows -- presumably that is the error meant. The algorithm
    # is intentionally left as benchmarked.
    return index
def group_paul(size=35000000, high=1 << 32):
    """"Paul" benchmark: vectorized run-length count with ``np.diff``.

    Draws ``size`` random integers from ``[0, high)``, sorts them, locates
    the first index of every run of equal values, and derives each run's
    length from the distance to the next run's start.

    NOTE(review): two lines of this snippet were truncated in the source
    (the ``randint`` arguments and the ``dtype=`` of ``np.empty``); they
    are restored by analogy with the sibling snippets -- confirm against
    the original post.

    Args:
        size: Number of random integers to generate.
        high: Exclusive upper bound for the random integers.

    Returns:
        numpy.ndarray: Structured array with dtype ``'u4,u2'``; field
        ``f0`` is the distinct value and ``f1`` its count. Counts are
        cast to 16-bit unsigned integers on assignment.
    """
    import numpy as np

    values = np.array(np.random.randint(0, high, size=size), dtype='u4')
    values.sort()

    # With no data, the forced leading 1 below would invent a group at
    # index 0 and values[[0]] would raise IndexError.
    if len(values) == 0:
        return np.empty(0, dtype='u4,u2')

    # Non-zero entries mark run starts; the leading 1 flags index 0.
    diff = np.concatenate(([1], np.diff(values)))
    # Run starts followed by len(values) as an end sentinel, so that
    # np.diff(idx) yields the length of every run including the last.
    idx = np.concatenate((np.where(diff)[0], [len(values)]))

    index = np.empty(len(idx) - 1, dtype='u4,u2')
    index['f0'] = values[idx[:-1]]
    index['f1'] = np.diff(idx)
    return index
if __name__ == '__main__':
    from timeit import Timer

    # (function name, label printed in the report)
    timings = [
        ("group_original", "Original"),
        ("group_gnibbler", "Gnibbler"),
        ("group_christophe", "Christophe"),
        ("group_paul", "Paul"),
    ]
    for method, title in timings:
        # Build the statement and the setup import from the function name
        # so each variant is timed through the same harness, once each.
        t = Timer("%s()" % method, "from __main__ import %s" % method)
        # Single-argument print(...) runs on both Python 2 and Python 3,
        # unlike the bare ``print x`` statement.
        print("%s: %s secs" % (title, t.timeit(number=1)))
返回:
$ python bench.py
Original: 113.385262966 secs
Gnibbler: 71.7464978695 secs
Christophe: 27.1690568924 secs
Paul: 9.06268405914 secs
不过,Christophe 的方法目前给出的结果并不正确。