问题描述
我需要按大约日期连接两个表,以使表old
中的每一行仅与表new
中最接近的日期一对一匹配。不允许复制new
行的副本-仅匹配一次即可获得最小的差异。
这里是一些示例:
CREATE TABLE `new` (`ID` int(2),`date` datetime,`new` varchar(1));
INSERT INTO `new` (`ID`,`date`,`new`) VALUES
(1,'2016-03-02 12:20:00','t'),(1,'2016-03-07 12:20:00','u'),'2016-04-02 12:20:00','v'),(2,'2016-04-12 11:03:00','x');
CREATE TABLE `old` (`ID` int(2),`old` varchar(1));
INSERT INTO `old` (`ID`,`old`) VALUES
(1,'a'),'b'),'2016-03-01 10:09:00','c'),'2015-04-12 10:09:00','d'),'2016-03-03 12:20:00','e');
我期望的输出是这样:
ID old.date old new.date new
1 2016-03-07 12:20:00 a 2016-03-07 12:20:00 u
1 2016-04-02 12:20:00 b 2016-04-02 12:20:00 v
1 2016-03-01 10:09:00 c NULL NULL
1 2015-04-12 10:09:00 d NULL NULL
1 2016-03-03 12:20:00 e 2016-03-02 12:20:00 t
2 NULL NULL 2016-04-12 11:03:00 x
我能够通过以下方式接近这个目标
:SELECT * FROM old A LEFT OUTER JOIN new B ON A.ID=B.ID AND ABS(TIMESTAMPDIFF(day,A.date,B.date))<2
UNION
SELECT * FROM old A RIGHT OUTER JOIN new B ON A.ID=B.ID AND ABS(TIMESTAMPDIFF(day,B.date))<2
ORDER BY old
,但是很明显,这最终会在指定的时间范围内匹配多行,而不仅仅是最佳匹配。玩几天游戏对我来说不是一个解决方案,因为实际上我有两个巨大的表要联接,并且这需要与一个时间窗配合使用,在该时间窗中许多行将有多个匹配项。
解决方法
请考虑以下内容...
DROP TABLE IF EXISTS new;
CREATE TABLE new
(ID INT NOT NULL,date DATETIME,new CHAR(1),PRIMARY KEY(ID,date)
);
INSERT INTO new VALUES
(1,'2016-03-02 12:20:00','t'),(1,'2016-03-07 12:20:00','u'),'2016-04-02 12:20:00','v'),(2,'2016-04-12 11:03:00','x')
;
CREATE TABLE old
(ID INT NOT NULL,old CHAR(1),date)
);
INSERT INTO old VALUES
(1,'a'),'b'),'2016-03-01 10:09:00','c'),'2015-04-12 10:09:00','d'),'2016-03-03 12:20:00','e');
SELECT a.id old_id,a.date old_date,a.old,b.id new_id,b.date new_date,b.new
FROM
( SELECT old.*,MIN(ABS(UNIX_TIMESTAMP(old.date)-UNIX_TIMESTAMP(new.date))) delta
FROM old
JOIN new ON new.id = old.id
GROUP
BY old.id,old.date
) a
LEFT
JOIN new b
ON b.id = a.id
AND ABS(UNIX_TIMESTAMP(a.date)-UNIX_TIMESTAMP(b.date)) = a.delta
UNION
SELECT a.id old_id,b.new
FROM
( SELECT old.*,MIN(ABS(UNIX_TIMESTAMP(old.date)-UNIX_TIMESTAMP(new.date))) delta
FROM old
JOIN new ON new.id = old.id
GROUP
BY old.id,old.date
) a
RIGHT
JOIN new b
ON b.id = a.id
AND ABS(UNIX_TIMESTAMP(a.date)-UNIX_TIMESTAMP(b.date)) = a.delta
ORDER
BY old IS NULL,old
;
+--------+---------------------+------+--------+---------------------+------+
| old_id | old_date | old | new_id | new_date | new |
+--------+---------------------+------+--------+---------------------+------+
| 1 | 2016-03-07 12:20:00 | a | 1 | 2016-03-07 12:20:00 | u |
| 1 | 2016-04-02 12:20:00 | b | 1 | 2016-04-02 12:20:00 | v |
| 1 | 2016-03-01 10:09:00 | c | 1 | 2016-03-02 12:20:00 | t |
| 1 | 2015-04-12 10:09:00 | d | 1 | 2016-03-02 12:20:00 | t |
| 1 | 2016-03-03 12:20:00 | e | 1 | 2016-03-02 12:20:00 | t |
| NULL | NULL | NULL | 2 | 2016-04-12 11:03:00 | x |
+--------+---------------------+------+--------+---------------------+------+
对于这个难题的最后一部分-删除重复项,我可能会在应用程序代码中进行处理。
,看来,实现近似1对1连接的唯一方法是在存储过程中使用cursor
。
感谢@Strawberry向我指出正确的方向-您将在下面看到重复使用的代码段。这是最终为我工作的解决方案。它输出的记录排序不同,但至少是真正的一对一匹配。
DROP PROCEDURE IF EXISTS amerge;
DELIMITER //
CREATE PROCEDURE amerge()
BEGIN
/* Necessary declarations */
DECLARE o_ID INT DEFAULT 0;
DECLARE o_date VARCHAR(30) DEFAULT 0;
DECLARE o_old VARCHAR(2);
DECLARE o_mdiff FLOAT;
DECLARE ct INT DEFAULT 0;
DECLARE done INT DEFAULT FALSE;
DECLARE cursor1 CURSOR FOR SELECT ID,date,old,mdiff FROM t1;
DECLARE CONTINUE HANDLER FOR NOT FOUND SET done = TRUE;
/* Temporary tables */
-- copy of 'old' with diff column = min difference
CREATE TEMPORARY TABLE t1
SELECT old.*,MIN(ABS(TIMESTAMPDIFF(hour,old.date,new.date))) AS mdiff
FROM old JOIN new ON old.ID=new.ID
GROUP BY old.ID,old.date
ORDER BY mdiff ASC;
-- cartesian join with abs(diff) column
CREATE TEMPORARY TABLE t2
SELECT old.ID AS ID_1,old.date AS date_1,new.ID as ID_2,new.date AS date_2,new,ABS(TIMESTAMPDIFF(hour,new.date)) AS diff
FROM old CROSS JOIN new ON old.ID=new.ID
ORDER BY diff ASC;
-- empty table to fill in with the results
CREATE TEMPORARY TABLE t3
(id_1 INT,date_1 DATETIME,id_2 INT,date_2 DATETIME,old VARCHAR(2),new VARCHAR(2),diff FLOAT);
/* Cursor */
OPEN cursor1;
getparams: LOOP
FETCH cursor1 INTO o_ID,o_date,o_old,o_mdiff;
IF done THEN
LEAVE getparams;
END IF;
SELECT COUNT(*) FROM t2 WHERE t2.ID_1=o_ID AND t2.date_1=o_date AND t2.old=o_old AND t2.diff=o_mdiff INTO ct;
CASE ct
WHEN 0 THEN
INSERT INTO t3 VALUES (o_ID,NULL,o_mdiff);
ELSE
INSERT INTO t3 SELECT * FROM t2 WHERE t2.ID_1=o_ID AND t2.date_1=o_date AND t2.old=o_old AND t2.diff=o_mdiff LIMIT 1;
END CASE;
DELETE FROM t2 WHERE t2.ID_2=o_ID AND t2.date_2 IN (SELECT date_2 FROM t3 WHERE t3.date_1=o_date);
END LOOP getparams;
CLOSE cursor1;
/* Workaround for error of reopening temp tables in MySQL */
DROP TEMPORARY TABLE t2;
CREATE TEMPORARY TABLE t2
SELECT * FROM t3;
/* Output */
SELECT * FROM t2
UNION
SELECT NULL AS ID_1,NULL AS date_1,NULL AS old,new.new AS new,NULL AS diff
FROM new LEFT JOIN t3 ON t3.ID_2=new.ID AND t3.date_2 = new.date WHERE t3.ID_2 IS NULL;
END //
DELIMITER ;
CALL amerge();
输出为(使用上例中的数据,并且PRIMARY键设置为ID + date):
id_1 date_1 id_2 date_2 old new diff
1 2016-03-07 12:20:00 1 2016-03-07 12:20:00 a u 0
1 2016-04-02 12:20:00 1 2016-04-02 12:20:00 b v 0
1 2016-03-03 12:20:00 1 2016-03-02 12:20:00 e t 24
1 2016-03-01 10:09:00 NULL c NULL 26
1 2015-04-12 10:09:00 NULL d NULL 7802
NULL 2 2016-04-12 11:03:00 NULL x NULL