|
drop table overlap_test; CREATE TABLE overlap_test ( id NUMBER primary key, staff_name varchar2(100), start_date DATE, end_date DATE ); -- id是主键 -- 查询相同staff有日期重叠的,从下面的看出,要查除了除了id=1,2,8的 INSERT INTO overlap_test VALUES (1, 'a',TO_DATE('01-JAN-2024','DD-MON-YYYY'), TO_DATE('05-JAN-2024','DD-MON-YYYY')); INSERT INTO overlap_test VALUES (2, 'a',TO_DATE('05-MAR-2024','DD-MON-YYYY'), TO_DATE('08-MAR-2024','DD-MON-YYYY')); INSERT INTO overlap_test VALUES (3, 'b',TO_DATE('04-MAR-2024','DD-MON-YYYY'), TO_DATE('07-MAR-2024','DD-MON-YYYY')); INSERT INTO overlap_test VALUES (4, 'b',TO_DATE('06-MAR-2024','DD-MON-YYYY'), TO_DATE('09-MAR-2024','DD-MON-YYYY')); INSERT INTO overlap_test VALUES (5, 'b',TO_DATE('06-MAR-2024','DD-MON-YYYY'), TO_DATE('07-MAR-2024','DD-MON-YYYY')); INSERT INTO overlap_test VALUES (6, 'c',TO_DATE('04-MAR-2024','DD-MON-YYYY'), TO_DATE('09-MAR-2024','DD-MON-YYYY')); INSERT INTO overlap_test VALUES (7, 'c',TO_DATE('05-MAR-2024','DD-MON-YYYY'), TO_DATE('10-MAR-2024','DD-MON-YYYY')); INSERT INTO overlap_test VALUES (8, 'c',TO_DATE('11-MAR-2024','DD-MON-YYYY'), TO_DATE('12-MAR-2024','DD-MON-YYYY')); INSERT INTO overlap_test VALUES (9, 'd',TO_DATE('01-MAR-2024','DD-MON-YYYY'), TO_DATE('12-MAR-2024','DD-MON-YYYY')); INSERT INTO overlap_test VALUES (10,'d',TO_DATE('02-MAR-2024','DD-MON-YYYY'), TO_DATE('03-MAR-2024','DD-MON-YYYY')); COMMIT; -- 建个索引,分析不同写法的执行计划 create index idx_overlap_test on overlap_test(staff_name,start_date,end_date); |
overlap重叠查找算法:
一般用子查询实现,如果用join会导致结果集重复数量增多,因为不是1对1关系,而是1行对应多行。
使用子查询自关联,比如主表a,子查询的a取个别名b,对应的有start_date,end_date以及唯一标识id以及自关联的比如这里是staff_id
overlap思想:
分为交集和包含关系两种:
1)交集
start end
| | -- 记录 a
| | -- 第二条记录 b
可以用a.end_date between b.start_date and b.end_date
查询第二条可以用a.start_date between b.start_date and a.end_date
整体可以用or关系表示
a.end_date between b.start_date and b.end_date
or
a.start_date between b.start_date and a.end_date
2) 包含关系
start end
| | -- 记录 a
| | -- 第二条 b
用1)交集算法则a查询不到的,要查询a,也即按照第二条的start_date为基准即可
a.start_date < b.start_date and a.end_date > b.start_date
如果将 交集和并集合并,简单的算法是: a的开始小于等于b的结束,a的结束大于等于b的开始
a.start_date <= b.end_date and a.end_date>= b.start_date
1: 子查询里有OR,不能展开,走FILTER,可能效率不好
1) a的开始时间在b的时间范围内 或
2) a的结束时间在b的时间范围内 或
3) a的开始时间
2:没有OR,按照条件关联比如staff_name一样的,可以走HASH JOIN,可能更好
如果将 交集和并集合并,简单的算法是: a的开始小于等于b的结束,a的结束大于等于b的开始
a.start_date <= b.end_date and a.end_date>= b.start_date
1实现: 这种子查询有or的复杂条件,不能展开,只能走FILTER,可能效率低:
|
SELECT a.* FROM overlap_test a WHERE EXISTS (SELECT 1 FROM overlap_test b WHERE ( a.start_date between b.start_date and b.end_date or a.end_date between b.start_date and b.end_date or (a.start_date < b.start_date and a.end_date > b.start_date) ) AND a.staff_name = b.staff_name AND a.id <> b.id ); ID STAFF_NAME START_DATE END_DATE ---------- -------------------- ----------------- ----------------- 3 b 20240304 00:00:00 20240307 00:00:00 4 b 20240306 00:00:00 20240309 00:00:00 5 b 20240306 00:00:00 20240307 00:00:00 6 c 20240304 00:00:00 20240309 00:00:00 7 c 20240305 00:00:00 20240310 00:00:00 9 d 20240301 00:00:00 20240312 00:00:00 10 d 20240302 00:00:00 20240303 00:00:00 7 rows selected. Elapsed: 00:00:00.02 Execution Plan ---------------------------------------------------------- Plan hash value: 3561317091 --------------------------------------------------------------------------------------------------------- | Id | Operation | Name | Rows | Bytes | Cost (%CPU)| Time | --------------------------------------------------------------------------------------------------------- | 0 | SELECT STATEMENT | | 1 | 21 | 13 (0)| 00:00:01 | |* 1 | FILTER | | | | | | | 2 | TABLE ACCESS FULL | OVERLAP_TEST | 10 | 210 | 3 (0)| 00:00:01 | |* 3 | TABLE ACCESS BY INDEX ROWID BATCHED| OVERLAP_TEST | 1 | 21 | 2 (0)| 00:00:01 | |* 4 | INDEX RANGE SCAN | IDX_OVERLAP_TEST | 1 | | 1 (0)| 00:00:01 | --------------------------------------------------------------------------------------------------------- Predicate Information (identified by operation id): --------------------------------------------------- 1 - filter( EXISTS (SELECT 0 FROM "OVERLAP_TEST" "B" WHERE "B"."STAFF_NAME"=:B1 AND "B"."ID"<>:B2 AND ("B"."START_DATE"<=:B3 AND "B"."END_DATE">=:B4 OR "B"."START_DATE"<=:B5 AND "B"."END_DATE">=:B6 OR "B"."START_DATE">:B7 AND "B"."START_DATE"<:B8))) 3 - filter("B"."ID"<>:B1) 4 - access("B"."STAFF_NAME"=:B1) filter("B"."START_DATE"<=:B1 AND "B"."END_DATE">=:B2 OR "B"."START_DATE"<=:B3 AND "B"."END_DATE">=:B4 OR "B"."START_DATE">:B5 AND "B"."START_DATE"<:B6) Statistics ---------------------------------------------------------- 0 recursive calls 0 db block gets 28 consistent gets 0 physical reads 0 redo size 988 bytes sent via SQL*Net to client 811 bytes received via SQL*Net from client 2 SQL*Net roundtrips to/from client 0 sorts (memory) 0 sorts (disk) 7 rows processed --缺少包含关系,少个id=9 SELECT a.* FROM overlap_test a WHERE EXISTS (SELECT 1 FROM overlap_test b WHERE (a.start_date between b.start_date and b.end_date or a.end_date between b.start_date and b.end_date) AND b.staff_name = a.staff_name AND b.id <> a.id); ID STAFF_NAME START_DATE END_DATE ---------- -------------------- ----------------- ----------------- 3 b 20240304 00:00:00 20240307 00:00:00 4 b 20240306 00:00:00 20240309 00:00:00 5 b 20240306 00:00:00 20240307 00:00:00 6 c 20240304 00:00:00 20240309 00:00:00 7 c 20240305 00:00:00 20240310 00:00:00 10 d 20240302 00:00:00 20240303 00:00:00 6 rows selected. |
|
SELECT a.* FROM overlap_test a WHERE EXISTS (SELECT 1 FROM overlap_test b WHERE ( a.start_date <= b.end_date and a.end_date >= b.start_date ) AND b.staff_name = a.staff_name AND b.id <> a.id); ID STAFF_NAME START_DATE END_DATE ---------- -------------------- ----------------- ----------------- 4 b 20240306 00:00:00 20240309 00:00:00 5 b 20240306 00:00:00 20240307 00:00:00 3 b 20240304 00:00:00 20240307 00:00:00 7 c 20240305 00:00:00 20240310 00:00:00 6 c 20240304 00:00:00 20240309 00:00:00 10 d 20240302 00:00:00 20240303 00:00:00 9 d 20240301 00:00:00 20240312 00:00:00 7 rows selected. Elapsed: 00:00:00.00 Execution Plan ---------------------------------------------------------- Plan hash value: 1291046832 -------------------------------------------------------------------------------------------- | Id | Operation | Name | Rows | Bytes | Cost (%CPU)| Time | -------------------------------------------------------------------------------------------- | 0 | SELECT STATEMENT | | 10 | 420 | 4 (0)| 00:00:01 | |* 1 | HASH JOIN SEMI | | 10 | 420 | 4 (0)| 00:00:01 | | 2 | VIEW | index$_join$_001 | 10 | 210 | 2 (0)| 00:00:01 | |* 3 | HASH JOIN | | | | | | | 4 | INDEX FAST FULL SCAN| IDX_OVERLAP_TEST | 10 | 210 | 1 (0)| 00:00:01 | | 5 | INDEX FAST FULL SCAN| SYS_C009213 | 10 | 210 | 1 (0)| 00:00:01 | | 6 | VIEW | index$_join$_002 | 10 | 210 | 2 (0)| 00:00:01 | |* 7 | HASH JOIN | | | | | | | 8 | INDEX FAST FULL SCAN| IDX_OVERLAP_TEST | 10 | 210 | 1 (0)| 00:00:01 | | 9 | INDEX FAST FULL SCAN| SYS_C009213 | 10 | 210 | 1 (0)| 00:00:01 | -------------------------------------------------------------------------------------------- Predicate Information (identified by operation id): --------------------------------------------------- 1 - access("B"."STAFF_NAME"="A"."STAFF_NAME") filter("A"."START_DATE"<="B"."END_DATE" AND "A"."END_DATE">="B"."START_DATE" AND "B"."ID"<>"A"."ID") 3 - access(ROWID=ROWID) 7 - access(ROWID=ROWID) Statistics ---------------------------------------------------------- 0 recursive calls 0 db block gets 17 consistent gets 0 physical reads 0 redo size 988 bytes sent via SQL*Net to client 682 bytes received via SQL*Net from client 2 SQL*Net roundtrips to/from client 0 sorts (memory) 0 sorts (disk) 7 rows processed --不能使用join,有重复数据 SELECT a.* FROM overlap_test a ,overlap_test b WHERE (a.start_date between b.start_date and b.end_date or a.end_date between b.start_date and b.end_date) AND b.staff_name = a.staff_name AND b.id <> a.id; ID STAFF_NAME START_DATE END_DATE ---------- -------------------- ----------------- ----------------- 4 b 20240306 00:00:00 20240309 00:00:00 5 b 20240306 00:00:00 20240307 00:00:00 3 b 20240304 00:00:00 20240307 00:00:00 5 b 20240306 00:00:00 20240307 00:00:00 3 b 20240304 00:00:00 20240307 00:00:00 4 b 20240306 00:00:00 20240309 00:00:00 7 c 20240305 00:00:00 20240310 00:00:00 6 c 20240304 00:00:00 20240309 00:00:00 10 d 20240302 00:00:00 20240303 00:00:00 9 rows selected. Elapsed: 00:00:00.00 |