问题描述
我有很多事件。
每个事件都表示为一个元组,包含 session_id 和 datetime:
[
('aa','2020-11-08 01:00:01'),('aa','2020-11-08 01:00:03'),('aa','2020-11-08 01:00:05'),('ab','2020-11-09 01:00:00'),('ab','2020-11-09 01:00:05'),('ab','2020-11-09 01:00:15')
]
我需要使用此数组计算平均会话时间。
所以我需要将此数组转换为新数组[(session_id,min(datetime),max(datetime))]
[
('aa','2020-11-08 01:00:01','2020-11-08 01:00:05'),('ab','2020-11-09 01:00:00','2020-11-09 01:00:15')
]
然后为每个 session_id 计算会话时长(秒)[(session_id,max(datetime) - min(datetime))]
[
('aa',4),('ab',15)
]
然后计算平均会话时间((4+15)/2) = 9.5
最好的方法是什么?
解决方法
要获得理想的结果,我将使用数据的关系表示而不是数组。
arrayJoin帮助将数组转换为关系:
-- Average session duration in seconds, computed relationally:
-- unfold the event array into rows, take max(time) - min(time) per session id,
-- then average the per-session durations.
SELECT avg(duration)
FROM
(
    -- One row per session: seconds between its first and last event.
    SELECT max(time) - min(time) AS duration
    FROM
    (
        SELECT
            data.1 AS id,
            toDateTime(data.2) AS time
        FROM
        (
            -- arrayJoin emits one row per (session_id, datetime) tuple.
            SELECT arrayJoin([
                ('aa', '2020-11-08 01:00:01'),
                ('aa', '2020-11-08 01:00:03'),
                ('aa', '2020-11-08 01:00:05'),
                ('ab', '2020-11-09 01:00:00'),
                ('ab', '2020-11-09 01:00:05'),
                ('ab', '2020-11-09 01:00:15')
            ]) AS data
        )
    )
    GROUP BY id
)
/*
┌─avg(duration)─┐
│ 9.5 │
└───────────────┘
*/
基于数组的解决方案。请注意,它可能比基于关系的解决方案慢得多(在选定最终方案之前,请先对两者进行基准测试)。可以使用 arrayReduceInRanges 函数来改进此实现。
-- Array-only variant: the whole computation happens inside a single row,
-- without unfolding the array into rows.
SELECT
    -- Order events by session id so each session forms one contiguous run.
    arraySort(x -> (x.1), data) AS sorted_array,
    -- Build a 0/1 flag per element: 1 for the first element and for every
    -- element whose session id differs from its predecessor's. arraySplit
    -- starts a new sub-array to the left of each element flagged non-zero.
    arraySplit(
        (x, y) -> y,
        sorted_array,
        arrayMap(
            (x, i) -> if(i = 1, 1, if((x.1) = ((sorted_array[i - 1]).1), 0, 1)),
            sorted_array,
            arrayEnumerate(sorted_array)
        )
    ) AS session_arrays,
    -- First and last event time of each session.
    arrayMap(arr -> arrayReduce('min', arrayMap(x -> (x.2), arr)), session_arrays) AS min_session_times,
    arrayMap(arr -> arrayReduce('max', arrayMap(x -> (x.2), arr)), session_arrays) AS max_session_times,
    -- Per-session duration (max - min), averaged over all sessions.
    arrayReduce('avg', arrayMap((x, y) -> (y - x), min_session_times, max_session_times)) AS avg
FROM
(
    SELECT [
        ('aa', toDateTime('2020-11-08 01:00:01')),
        ('aa', toDateTime('2020-11-08 01:00:03')),
        ('aa', toDateTime('2020-11-08 01:00:05')),
        ('ab', toDateTime('2020-11-09 01:00:00')),
        ('ab', toDateTime('2020-11-09 01:00:05')),
        ('ab', toDateTime('2020-11-09 01:00:15'))
    ] AS data
)
/*
Row 1:
──────
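sorted_array: [('aa','2020-11-08 01:00:01'),('aa','2020-11-08 01:00:03'),('aa','2020-11-08 01:00:05'),('ab','2020-11-09 01:00:00'),('ab','2020-11-09 01:00:05'),('ab','2020-11-09 01:00:15')]
session_arrays: [[('aa','2020-11-08 01:00:01'),('aa','2020-11-08 01:00:03'),('aa','2020-11-08 01:00:05')],[('ab','2020-11-09 01:00:00'),('ab','2020-11-09 01:00:05'),('ab','2020-11-09 01:00:15')]]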
min_session_times: ['2020-11-08 01:00:01','2020-11-09 01:00:00']
max_session_times: ['2020-11-08 01:00:05','2020-11-09 01:00:15']
avg: 9.5
*/
另一种方法:使用 minMap(data.1,data.2)(以及对应的 maxMap)按 session_id 聚合:
-- minMap(keys, values) returns a tuple (keys, minimum value per key).
-- Here the keys are session ids and the values are datetime strings, so the
-- result holds the earliest event time of each session.
SELECT minMap(data.1, data.2)
FROM
(
    SELECT [
        ('aa', '2020-11-08 01:00:01'),
        ('aa', '2020-11-08 01:00:03'),
        ('aa', '2020-11-08 01:00:05'),
        ('ab', '2020-11-09 01:00:00'),
        ('ab', '2020-11-09 01:00:05'),
        ('ab', '2020-11-09 01:00:15')
    ] AS data
)
┌─minMap(tupleElement(data,1),tupleElement(data,2))────────┐
│ (['aa','ab'],['2020-11-08 01:00:01','2020-11-09 01:00:00']) │
└─────────────────────────────────────────────────────────────┘
-- Average session duration via minMap / maxMap:
--   x = earliest datetime per session id, y = latest datetime per session id,
--   r = per-session duration in seconds, z = average of r.
-- x and y come from the same key set, so their elements pair up by session.
SELECT
    minMap(data.1, data.2).2 AS x,
    maxMap(data.1, data.2).2 AS y,
    -- The values are strings, so convert to DateTime before subtracting.
    arrayMap((i, j) -> toDateTime(j) - toDateTime(i), x, y) AS r,
    arrayReduce('avg', r) AS z
FROM
(
    SELECT [
        ('aa', '2020-11-08 01:00:01'),
        ('aa', '2020-11-08 01:00:03'),
        ('aa', '2020-11-08 01:00:05'),
        ('ab', '2020-11-09 01:00:00'),
        ('ab', '2020-11-09 01:00:05'),
        ('ab', '2020-11-09 01:00:15')
    ] AS data
)
┌─x─────────────────────────────────────────────┬─y─────────────────────────────────────────────┬─r──────┬───z─┐
│ ['2020-11-08 01:00:01','2020-11-09 01:00:00'] │ ['2020-11-08 01:00:05','2020-11-09 01:00:15'] │ [4,15] │ 9.5 │
└───────────────────────────────────────────────┴───────────────────────────────────────────────┴────────┴─────┘