[postgis-users] JOIN performance for ST_Point()

Wed Mar 4 22:46:35 PST 2020

Hello,

I had a performance issue when I JOIN two tables by using the equivalence
of two ST_Point()s.
I'm wondering if it is a kind of specification...

-- SQL statements FROM HERE
BEGIN;

SELECT version(), postgis_version();

DROP TABLE IF EXISTS table1;
DROP TABLE IF EXISTS table2;

CREATE TEMPORARY SEQUENCE gen_serial;

CREATE TABLE table1 AS (
  SELECT
    nextval('gen_serial') AS id,
    ST_Point(x, y) AS coord
  FROM (
    SELECT
      x, y
    FROM
      (SELECT generate_series(1, 100) x) set_x
    CROSS JOIN
      (SELECT generate_series(1, 100) y) set_y
  ) tmp
);

CREATE TABLE table2 AS (
  SELECT
    nextval('gen_serial') AS id,
    ST_Point(x, y) AS coord
  FROM (
    SELECT
      x, y
    FROM
      (SELECT generate_series(25, 75) x) set_x
    CROSS JOIN
      (SELECT generate_series(25, 75) y) set_y
  ) tmp
);

-- test case (1)
EXPLAIN ANALYZE
SELECT
  COUNT(*)
FROM
  table1
JOIN
  table2
ON
  ST_Equals(table1.coord, table2.coord)
;

-- test case (2)
EXPLAIN ANALYZE
SELECT
  COUNT(*)
FROM
  table1
JOIN
  table2
ON
  table1.coord = table2.coord
;

-- test case (3)
EXPLAIN ANALYZE
SELECT
  COUNT(*)
FROM
  table1
JOIN
  table2
ON
  ST_AsText(table1.coord) = ST_AsText(table2.coord)
;

END;

-- SQL statements TO HERE

-- Results FROM HERE
BEGIN
                                                             version
                                                       |
 postgis_version
----------------------------------------------------------------------------------------------------------------------------------+---------------------------------------
 PostgreSQL 11.2 (Debian 11.2-1.pgdg90+1) on x86_64-pc-linux-gnu, compiled
by gcc (Debian 6.3.0-18+deb9u1) 6.3.0 20170516, 64-bit | 2.5 USE_GEOS=1
USE_PROJ=1 USE_STATS=1
(1 row)

DROP TABLE
DROP TABLE
CREATE SEQUENCE
SELECT 10000
SELECT 2601
                                                        QUERY PLAN

--------------------------------------------------------------------------------------------------------------------------
 Aggregate  (cost=7052229.97..7052229.98 rows=1 width=8) (actual
time=7096.372..7096.373 rows=1 loops=1)
   ->  Nested Loop  (cost=0.00..7052207.80 rows=8870 width=0) (actual
time=1692.069..7096.193 rows=2601 loops=1)
         Join Filter: ((table1.coord ~= table2.coord) AND
_st_equals(table1.coord, table2.coord))
         Rows Removed by Join Filter: 26007399
         ->  Seq Scan on table1  (cost=0.00..184.80 rows=10080 width=32)
(actual time=0.004..0.767 rows=10000 loops=1)
         ->  Materialize  (cost=0.00..61.60 rows=2640 width=32) (actual
time=0.000..0.090 rows=2601 loops=10000)
               ->  Seq Scan on table2  (cost=0.00..48.40 rows=2640
width=32) (actual time=0.004..0.244 rows=2601 loops=1)
 Planning Time: 0.136 ms
 Execution Time: 7096.493 ms
(9 rows)

                                                        QUERY PLAN

--------------------------------------------------------------------------------------------------------------------------
 Aggregate  (cost=399474.33..399474.34 rows=1 width=8) (actual
time=6870.980..6870.980 rows=1 loops=1)
   ->  Nested Loop  (cost=0.00..399407.80 rows=26611 width=0) (actual
time=1665.642..6870.790 rows=2601 loops=1)
         Join Filter: (table1.coord = table2.coord)
         Rows Removed by Join Filter: 26007399
         ->  Seq Scan on table1  (cost=0.00..184.80 rows=10080 width=32)
(actual time=0.004..0.870 rows=10000 loops=1)
         ->  Materialize  (cost=0.00..61.60 rows=2640 width=32) (actual
time=0.000..0.099 rows=2601 loops=10000)
               ->  Seq Scan on table2  (cost=0.00..48.40 rows=2640
width=32) (actual time=0.004..0.258 rows=2601 loops=1)
 Planning Time: 0.031 ms
 Execution Time: 6871.063 ms
(9 rows)

                                                          QUERY PLAN

------------------------------------------------------------------------------------------------------------------------------
 Aggregate  (cost=512255.20..512255.21 rows=1 width=8) (actual
time=44.998..44.998 rows=1 loops=1)
   ->  Merge Join  (cost=1053.52..511922.56 rows=133056 width=0) (actual
time=42.285..44.865 rows=2601 loops=1)
         Merge Cond: ((st_astext(table2.coord)) = (st_astext(table1.coord)))
         ->  Sort  (cost=198.44..205.04 rows=2640 width=32) (actual
time=4.408..4.514 rows=2601 loops=1)
               Sort Key: (st_astext(table2.coord))
               Sort Method: quicksort  Memory: 300kB
               ->  Seq Scan on table2  (cost=0.00..48.40 rows=2640
width=32) (actual time=0.018..3.587 rows=2601 loops=1)
         ->  Sort  (cost=855.08..880.28 rows=10080 width=32) (actual
time=37.466..37.760 rows=7329 loops=1)
               Sort Key: (st_astext(table1.coord))
               Sort Method: quicksort  Memory: 1166kB
               ->  Seq Scan on table1  (cost=0.00..184.80 rows=10080
width=32) (actual time=0.005..13.276 rows=10000 loops=1)
 Planning Time: 0.088 ms
 Execution Time: 45.104 ms
(13 rows)

COMMIT
-- Results TO HERE

The above three SQL statements, test case (1), (2), and (3) provides
exactly the same result, i.e. count = 2601.

The test case (1) uses ST_Equals and (2) uses '=' for comparison.
The test case (3) converts a ST_Point to text by using ST_AsText().
The test case (1)(2) is more intuitive and straightforward than the test
case (3).
However, the test case (3) is much faster than the test cases (1) and (2).

As described in QUERY PLAN, only the test case (3) could benefit from
Sort-Merge-Join.
Is it possible to get the equivalent performance with test case (1) and (2)?

Best regards,
Ryo Neyama