问题描述
我有 5,000,000 个以这种方式格式化的无序字符串 (Name.Name.Day-Month-Year 24hrTime):
"John.Howard.12-11-2020 13:14"
"Diane.Barry.29-07-2020 20:50"
"Joseph.Ferns.08-05-2020 08:02"
"Joseph.Ferns.02-03-2020 05:09"
"Josephine.Fernie.01-01-2020 07:20"
"Alex.Alexander.06-06-2020 10:10"
"Howard.Jennings.07-07-2020 13:17"
"Hannah.Johnson.08-08-2020 00:49"
...
找到时间在某个 n 和 m 之间的所有字符串的最快方法是什么?(即删除所有时间 < n 或时间 > m 的字符串的最快方法)
此过滤将使用不同的范围进行多次。时间范围必须始终在同一天,并且开始时间始终早于结束时间。
在 java 中,这是我目前的方法,给定一些时间字符串 M 和 N 以及一个 500 万个字符串列表:
// Asker's current approach (pseudo-Java, not compilable as written): parse the
// two bounds m and n once, then split every entry and compare its hour and
// minute against those bounds.
ArrayList<String> finalSolution = new ArrayList<>();
String[] startingMtimeArr = m.split(":");
String[] startingNtimeArr = n.split(":");
Integer startingMhour = Integer.parseInt(startingMtimeArr[0]);
Integer startingMminute = Integer.parseInt(startingMtimeArr[1]);
Integer endingNhour = Integer.parseInt(startingNtimeArr[0]);
Integer endingNminute = Integer.parseInt(startingNtimeArr[1]);
// Linear scan: every entry is re-split for each query.
for combinedString in ArraySizeOf5Million{
// NOTE(review): String.split takes a regular expression, so "." matches any
// character rather than a literal dot ("\\." would) - confirm against the Java docs.
String[] arr = combinedString.split(".");
// arr[2] is expected to be "Day-Month-Year HH:MM"; subArr[1] is then "HH:MM".
String[] subArr = arr[2].split(" ");
String[] timeArr = subArr[1].split(":");
String hour = timeArr[0];
String minute = timeArr[1];
// NOTE(review): hour and minute are Strings while the bounds are Integers, so
// these comparisons need the values parsed first.
// NOTE(review): minute is tested independently of hour; for a range such as
// 13:50-14:10 this requires minute >= 50 and minute <= 10, which nothing satisfies.
If hour >= startingMhour
&& minute >= startingMminute
&& hour <= endingNhour
&& minute <= endingNminute {
// Only the hour text is collected here, not the matching string itself.
finalSolution.add(hour)
}
}
Java 是我最熟悉的语言,但使用任何其他语言也可以。我想要的是更好/更快的逻辑。
解决方法
下面是一个 Python 示例,为一天中的每一分钟建立索引:
from pprint import pprint
from itertools import groupby
big_list = [
    "John.Howard.12:14",
    "Diane.Barry.13:50",
    "xxxDiane.Barryxxx.13:50",  # <-- added a name in the same HH:MM
    "Joseph.Ferns.08:02",
    "Joseph.Ferns.05:09",
    "Josephine.Fernie.07:20",
    "Alex.Alexander.10:10",
    "Howard.Jennings.12:17",
    "Hannah.Johnson.00:49",
]

# 1. Sort the list by its trailing HH:MM. The times are zero-padded, so
#    comparing them as text orders them chronologically.
big_list = sorted(big_list, key=lambda k: k[-5:])
# the list is now:
# ['Hannah.Johnson.00:49', 'Joseph.Ferns.05:09', 'Josephine.Fernie.07:20',
#  'Joseph.Ferns.08:02', 'Alex.Alexander.10:10', 'John.Howard.12:14',
#  'Howard.Jennings.12:17', 'Diane.Barry.13:50', 'xxxDiane.Barryxxx.13:50']

# 2. Create an index with one entry for every minute of the day.
#    index[minute] == [a, b] means big_list[a:b] holds exactly the entries at
#    that minute; for a minute without entries a == b == the position of the
#    first later entry.
index = {}
times = [int(item[-5:-3]) * 60 + int(item[-2:]) for item in big_list]

last = 0  # first minute that has no index entry yet
cnt = 0   # number of entries of big_list consumed so far
for v, g in groupby(times):
    # Minutes skipped since the previous group hold no entries.
    for i in range(last, v):
        index[i] = [cnt, cnt]
    s = sum(1 for _ in g)  # how many entries share minute v
    index[v] = [cnt, cnt + s]
    cnt += s
    last = v + 1
# Minutes after the last entry up to 23:59.
for i in range(last, 60 * 24):
    index[i] = [cnt, cnt]
# 3. you can now do a fast query using the index
def _hhmm_to_minutes(hhmm):
    """Return minutes since midnight for a string ending in ``HH:MM``."""
    return int(hhmm[-5:-3]) * 60 + int(hhmm[-2:])


def find_all_strings(n, m):
    """Return the entries of ``big_list`` whose time lies between *n* and *m*.

    Both bounds are ``"HH:MM"`` strings and are inclusive. The lookup is two
    dictionary reads on the per-minute ``index`` plus one list slice, so no
    entry is parsed at query time.

    ``m`` may be ``"24:00"`` to mean "until the end of the day".
    """
    start = _hhmm_to_minutes(n)
    end = _hhmm_to_minutes(m)
    if end == 24 * 60:
        # The index only has slots for minutes 0..1439; "24:00" as an upper
        # bound is the same as 23:59 inclusive.
        end -= 1
    return big_list[index[start][0]:index[end][1]]
# Example queries against the sample data; the trailing comment on each line
# shows the expected output. Both bounds are inclusive.
print(find_all_strings("00:10","00:30")) # []
print(find_all_strings("00:30","00:50")) # ['Hannah.Johnson.00:49']
print(find_all_strings("12:00","13:55")) # ['John.Howard.12:14','Howard.Jennings.12:17','Diane.Barry.13:50','xxxDiane.Barryxxx.13:50']
print(find_all_strings("13:00","13:55")) # ['Diane.Barry.13:50','xxxDiane.Barryxxx.13:50']
print(find_all_strings("15:00","23:00")) # []
,
由于数据将被多次搜索,我首先只解析一次字符串,以方便之后的多次搜索(参见 by_date)。
我使用二分查找定位指定日期中第一个符合条件的字符串,然后按时间递增的顺序向后遍历,在函数 strings_between 的变量 filtered 中收集符合条件的字符串。
# -*- coding: utf-8 -*-
"""
https://stackoverflow.com/questions/67562250/fastest-string-filtering-algorithm
Created on Tue May 18 09:20:11 2021
@author: Paddy3118
"""
# Sample input: one "First.Last.DD-MM-YYYY HH:MM" entry per line. The
# backslashes right after the opening quotes and right before the closing
# quotes keep the text from starting or ending with an empty line.
strings = """\
John.Howard.12-11-2020 13:14
Diane.Barry.29-07-2020 20:50
Joseph.Ferns.08-05-2020 08:02
Joseph.Ferns.02-03-2020 05:09
Josephine.Fernie.01-01-2020 07:20
Alex.Alexander.06-06-2020 10:10
Howard.Jennings.07-07-2020 13:17
Hannah.Johnson.08-08-2020 00:49
Josephine.Fernie.08-08-2020 07:20
Alex.Alexander.08-08-2020 10:10
Howard.Jennings.08-08-2020 13:17
Hannah.Johnson.08-08-2020 09:49\
"""
## First parse the date information once for all future range calcs
def to_mins(hr_mn='00:00'):
    """Convert an ``'HH:MM'`` string into minutes since midnight."""
    hours, minutes = map(int, hr_mn.split(':'))
    return hours * 60 + minutes
# Keys are individual days ('DD-MM-YYYY'); each value is that day's list of
# (minutes_since_midnight, original_string) pairs, sorted by time.
by_date = {}
for s in strings.splitlines():
    if not s.strip():
        continue  # tolerate blank lines instead of failing on the unpack below
    name_day, time = s.strip().split()
    # The day is whatever follows the last dot of "First.Last.DD-MM-YYYY".
    day = name_day.rsplit('.', 1)[1]
    by_date.setdefault(day, []).append((to_mins(time), s))
# Tuples sort by their first item, so each day ends up ordered by time
# (ties broken by the string itself).
for day_info in by_date.values():
    day_info.sort()
## Now rely on dict search for day then binary +linear search within day.
def _bisect_left(a, x):
    """Binary-search *a* for the leftmost position whose key is not below *x*.

    *a* is a list of tuples ordered by their first item; only that first item
    is compared against *x*. The returned index ``i`` satisfies
    ``e[0] < x`` for every ``e`` in ``a[:i]`` and ``e[0] >= x`` for every
    ``e`` in ``a[i:]``, so it is ``len(a)`` when every key is below *x*.
    """
    low = 0
    high = len(a)
    while True:
        if low >= high:
            return low
        middle = (low + high) // 2
        # Only `<` is used, mirroring list.sort() and heapq.
        if a[middle][0] < x:
            low = middle + 1
        else:
            high = middle
def strings_between(day="01-01-2020", start="00:00", finish="23:59"):
    """Return the strings recorded on *day* with ``start <= time <= finish``.

    *day* is a ``'DD-MM-YYYY'`` key of ``by_date``; *start* and *finish* are
    ``'HH:MM'`` strings and both bounds are inclusive. An unknown day, or a
    range that matches nothing, yields an empty list. Results are in time
    order.
    """
    day_data = by_date.get(day)
    if not day_data:
        return []
    start, finish = to_mins(start), to_mins(finish)
    from_index = _bisect_left(day_data, start)
    # Times are whole minutes, so the first entry at or after finish + 1 is
    # one past the last entry at or before finish. Searching for it avoids
    # copying and scanning the whole tail of the day for every query.
    to_index = _bisect_left(day_data, finish + 1)
    return [s for _, s in day_data[from_index:to_index]]
## Example data: the index built from the sample input above.
assert by_date == {
    '12-11-2020': [(794, 'John.Howard.12-11-2020 13:14')],
    '29-07-2020': [(1250, 'Diane.Barry.29-07-2020 20:50')],
    '08-05-2020': [(482, 'Joseph.Ferns.08-05-2020 08:02')],
    '02-03-2020': [(309, 'Joseph.Ferns.02-03-2020 05:09')],
    '01-01-2020': [(440, 'Josephine.Fernie.01-01-2020 07:20')],
    '06-06-2020': [(610, 'Alex.Alexander.06-06-2020 10:10')],
    '07-07-2020': [(797, 'Howard.Jennings.07-07-2020 13:17')],
    '08-08-2020': [
        (49, 'Hannah.Johnson.08-08-2020 00:49'),
        (440, 'Josephine.Fernie.08-08-2020 07:20'),
        (589, 'Hannah.Johnson.08-08-2020 09:49'),
        (610, 'Alex.Alexander.08-08-2020 10:10'),
        (797, 'Howard.Jennings.08-08-2020 13:17'),
    ],
}
## Example queries from command line
"""
In [7]: strings_between('08-08-2020')
Out[7]:
['Hannah.Johnson.08-08-2020 00:49','Josephine.Fernie.08-08-2020 07:20','Hannah.Johnson.08-08-2020 09:49','Alex.Alexander.08-08-2020 10:10','Howard.Jennings.08-08-2020 13:17']
In [8]: strings_between('08-08-2020','09:30','24:00')
Out[8]:
['Hannah.Johnson.08-08-2020 09:49','Alex.Alexander.08-08-2020 10:10','Howard.Jennings.08-08-2020 13:17']
In [9]: strings_between('08-08-2020','09:49','10:10')
Out[9]: ['Hannah.Johnson.08-08-2020 09:49','Alex.Alexander.08-08-2020 10:10']
In [10]:
"""
,
正如 @Paddy3118 已经指出的那样,二分搜索可能是最合适的方法。
- (如果您的数据在磁盘上):加载输入数据并按日期/时间排序。
- i0 为结果集的起始索引,i1 为结果集的结束索引(均通过二分搜索获得):枚举结果条目。
我使用的代码(在 Lisp 中)显示在本答案的末尾。它没有丝毫优化(我想可以通过一些优化工作使加载和初始排序更快)。
这就是我的交互式会话的样子(包括计时信息,对于包含 500 万个条目的 foo.txt 输入文件)。
rlwrap sbcl --dynamic-space-size 2048
这是 SBCL 2.1.1.debian,ANSI Common Lisp 的一个实现。
有关 SBCL 的更多信息,请访问 http://www.sbcl.org/。
SBCL 是免费软件,按原样提供,绝对没有保证。
它主要在公共领域;某些部分在下面提供
BSD 风格的许可证。查看 CREDITS 和 COPYING 文件
分发以获取更多信息。
(ql:quickload :cl-ppcre)
加载“cl-ppcre”:
加载 1 个 ASDF 系统:
cl-ppcre
;加载“cl-ppcre”
..
(:CL-PPCRE)
(load "fivemillion.lisp")
T
(time (defparameter data (load-input-for-queries "foo.txt")))
“排序……”
评价:
32.091 秒实时
总运行时间 32.090620 秒(31.386722 用户,0.703898 系统)
[ 运行时间包括 2.641 秒的 GC 时间和 29.450 秒的非 GC 时间。 ]
100.00% 中央处理器
15 个 lambda 转换
115,308,171,684 个处理器周期
已占用 6,088,198,752 字节
数据
(time (defparameter output (query-interval data '(2018 1 1) '(2018 1 2))))
评价:
0.000 秒的实时
总运行时间 0.000111 秒(0.000109 用户,0.000002 系统)
100.00% 中央处理器
395,172 个处理器周期
已占用 65,536 字节
输出
(time (defparameter output (query-interval data '(2018 1 1) '(2018 1 2 8))))
评价:
0.000 秒的实时
总运行时间 0.000113 秒(0.000110 用户,0.000003 系统)
100.00% 中央处理器
399,420 个处理器周期
已占用 65,536 字节
输出
(time (defparameter output (query-interval data '(2018 1 1) '(2019 1 1))))
评价:
0.020秒实时
总运行时间 0.022469 秒(0.022469 用户,0.000000 系统)
110.00% CPU
80,800,092 个处理器周期
已占用 15,958,016 字节
输出
因此,虽然加载和排序(只需执行一次)的耗时并不出色(但可以优化),(query-interval ...) 调用却非常快。
查询的结果集越大,函数返回的列表就越长(cons 单元越多,运行时间越长)。我本可以做得更聪明一些:只返回结果集的起始和结束索引,把条目的收集留给调用者。
这里是源代码,其中还包括生成我使用的测试数据集的代码:
(defun random-uppercase-character ()
  "Return a random character among the 26 character codes starting at #\\A."
  (let ((offset (random 26)))
    (code-char (+ offset (char-code #\A)))))
(defun random-lowercase-character ()
  "Return a random character among the 26 character codes starting at #\\a."
  (let ((offset (random 26)))
    (code-char (+ offset (char-code #\a)))))
(defun random-name-part (nchars)
  "Return a random name: one uppercase character followed by NCHARS - 1
lowercase characters."
  (with-output-to-string (out)
    (write-char (random-uppercase-character) out)
    (dotimes (k (1- nchars))
      (write-char (random-lowercase-character) out))))
(defun random-day-of-month ()
  "Return a random day number from 1 to 31.
Every month is treated as having 31 days, which is good enough for
generating test data."
  (1+ (random 31)))
(defun random-month-of-year ()
  "Return a random month number from 1 to 12."
  (1+ (random 12)))
(defun random-year ()
  "Return a random year from 2017 to 2021 inclusive."
  (let ((offset (random 5)))
    (+ offset 2017)))
(defun random-hour-of-day ()
  "Return a random hour from 0 to 23."
  (let ((hours-per-day 24))
    (random hours-per-day)))
(defun random-minute-of-hour ()
  "Return a random minute from 0 to 59."
  (let ((minutes-per-hour 60))
    (random minutes-per-hour)))
(defun random-entry (stream)
  "Write one random, double-quoted entry followed by a newline to STREAM.
The entry has the form \"Fname.Lname.DD-MM-YYYY HH:MM\". Day, month, hour
and minute are zero-padded to two digits so that the generated data has the
same shape as the input described in the question, e.g. 08-05-2020 08:02."
  (format stream "\"~a.~a.~2,'0d-~2,'0d-~d ~2,'0d:~2,'0d\"~%"
          (random-name-part 10)
          (random-name-part 10)
          (random-day-of-month)
          (random-month-of-year)
          (random-year)
          (random-hour-of-day)
          (random-minute-of-hour)))
(defun generate-input (entry-count file-name)
  "Write ENTRY-COUNT random entries to FILE-NAME, one per line.
An existing file of that name is superseded."
  (with-open-file (out file-name
                       :direction :output
                       :if-exists :supersede)
    (dotimes (k entry-count)
      (random-entry out))))
;; Precompiled scanner for one quoted input line. The dots separating the
;; name parts and the date are escaped so that they match a literal "." only;
;; an unescaped "." would accept any character in those positions.
(defparameter *line-scanner*
  (ppcre:create-scanner
   "\"(\\w+)\\.(\\w+)\\.(\\d+)-(\\d+)-(\\d+)\\s(\\d+):(\\d+)\""))
;; register: 0     1     2   3     4    5    6
;; meaning:  fname lname day month year hour minute
(defun decompose-line (line)
  "Split LINE with *LINE-SCANNER* and return a fresh 7-element vector:
first name and last name as strings, followed by day, month, year, hour and
minute as integers."
  (multiple-value-bind (whole-match registers)
      (ppcre:scan-to-strings *line-scanner* line)
    (declare (ignore whole-match))
    (flet ((text-at (i) (aref registers i))
           (integer-at (i) (parse-integer (aref registers i))))
      (vector (text-at 0)
              (text-at 1)
              (integer-at 2)
              (integer-at 3)
              (integer-at 4)
              (integer-at 5)
              (integer-at 6)))))
;; Positions of the individual fields inside the 7-element vector that
;; DECOMPOSE-LINE builds for every input line.
(defconstant +fname-index+ 0)
(defconstant +lname-index+ 1)
(defconstant +day-index+ 2)
(defconstant +month-index+ 3)
(defconstant +year-index+ 4)
(defconstant +hour-index+ 5)
(defconstant +minute-index+ 6)
;; Field indices listed from most to least significant: year, month, day,
;; hour, minute. The comparison functions below walk this vector in order,
;; so entries are ordered chronologically.
(defvar *compare-<-criteria*
(make-array 5 :initial-contents
(list +year-index+
+month-index+
+day-index+
+hour-index+
+minute-index+)))
(defun compare-< (dl1 dl2)
  "Return T when decomposed line DL1 is strictly earlier than DL2.
The fields named by *COMPARE-<-CRITERIA* are compared in order; the first
field that differs decides. Equal timestamps yield NIL."
  (loop for i below 5
        do (let* ((field (aref *compare-<-criteria* i))
                  (v1 (aref dl1 field))
                  (v2 (aref dl2 field)))
             (cond ((< v1 v2) (return t))
                   ((/= v1 v2) (return nil))))
        finally (return nil)))
(defun time-stamp-to-index (hours minutes)
  "Return the number of minutes represented by HOURS and MINUTES together."
  (let ((minutes-in-hours (* hours 60)))
    (+ minutes-in-hours minutes)))
(defun load-input-for-queries (file-name)
  "Read FILE-NAME line by line, decompose every line with DECOMPOSE-LINE and
return the decomposed lines as a vector sorted with COMPARE-<.
Prints \"sorting...\" before the sort starts."
  (let ((entries (with-open-file (in file-name :direction :input)
                   (loop for line = (read-line in nil nil)
                         while line
                         collect (decompose-line line)))))
    ;; The lines are collected into a list first because their number is not
    ;; known up front; the list is then copied into a vector for sorting.
    (let ((table (make-array (length entries) :initial-contents entries)))
      (print "sorting...")
      (terpri)
      (sort table #'compare-<))))
(defun unify-date-list (date)
  "Return a fresh list of exactly five elements made from DATE.
Missing trailing elements are filled with 0; elements beyond the fifth are
dropped."
  (let* ((kept (min 5 (length date)))
         (missing (- 5 kept)))
    (append (subseq date 0 kept)
            (make-list missing :initial-element 0))))
(defun decomposed-line-date<date-list (decomposed-line date-list)
  "Return T when the timestamp of DECOMPOSED-LINE is strictly earlier than
DATE-LIST.
DATE-LIST holds five numbers in the order of *COMPARE-<-CRITERIA*; the first
field that differs decides. Equal timestamps yield NIL."
  (loop for i below 5
        do (let ((v1 (aref decomposed-line (aref *compare-<-criteria* i)))
                 (v2 (nth i date-list)))
             (cond ((< v1 v2) (return t))
                   ((/= v1 v2) (return nil))))
        finally (return nil)))
(defun index-before (data key predicate
                     &key (left 0) (right (length data)))
  "Binary-search the vector DATA between LEFT (inclusive) and RIGHT
(exclusive) and return the index of the first element for which
(funcall PREDICATE element KEY) is false, or RIGHT when it is true for all
of them.
DATA must be ordered so that every element satisfying PREDICATE comes before
every element that does not. Ranges holding a single element are examined
like any other range, so the element itself decides whether LEFT or RIGHT is
returned."
  (loop while (< left right)
        do (let ((mid (+ left (floor (- right left) 2))))
             (if (funcall predicate (aref data mid) key)
                 ;; DATA[mid] is still before KEY: the answer lies to its right.
                 (setf left (1+ mid))
                 ;; DATA[mid] is a candidate: keep it inside the range.
                 (setf right mid))))
  left)
(defun query-interval (data start-date end-date)
  "Return a list of the entries of DATA located from START-DATE up to, but
not including, END-DATE, as positioned by INDEX-BEFORE.
START-DATE and END-DATE are lists of the form '(year month day hour minute)
or shorter versions such as '(year month day hour); omitted trailing values
default to 0. DATA is the sorted vector of decomposed lines."
  (let* ((from-date (unify-date-list start-date))
         (to-date (unify-date-list end-date))
         (start-index (index-before data
                                    from-date
                                    #'decomposed-line-date<date-list))
         ;; The second search only needs to look at or after START-INDEX,
         ;; clamped into the valid index range of DATA.
         (search-from (max 0 (min start-index (length data))))
         (end-index (index-before data
                                  to-date
                                  #'decomposed-line-date<date-list
                                  :left search-from)))
    (loop for i from start-index below end-index
          collect (aref data i))))