[postgis-devel] Lame attempt at cascade union in sql

Fri Aug 8 08:51:37 PDT 2008

Regina, I'm not sure what I can really test for you - I don't have 
PostGIS on my box, so it's difficult to replicate these tests.  And 
subsetting the geometries in JTS isn't going to give the same result 
set, due to the clustering.  Or am I missing something?  Is your sample 
dataset the same one that Kevin & I are using?

Anyway, don't feel too bad - I think there's a fundamental performance 
difference between the basic union function in JTS and in GEOS.  Java is 
much more efficient at memory allocation than C++ is (at least, using 
std C++ memory operations).  So barring a stroke of genius in GEOS-land 
I think JTS is alway going to out-perform.

Also, despite my earlier sarcastic remark about processor speed, this 
*is* going to make a difference - perhaps by 2-3x?  (Although I don't 
have a screaming fast box by any means...)

M

Obe, Regina wrote:
> Paul et. al,
>
> I'm beating Kevin sometimes now.  But I guess I have no hope of beating
> Martin.
> I think its something wrong with my approximation of the difficulty of
> gluing geometries the larger they get.
>
> Anyrate from 1000 - 5000 geometries I appear to be consistently beating
> Kevin's query with this dataset.  After around 8,000 he starts passing
> me and doing the whole set still seems the same as before - stuck at
> around 4 minutes vs. Kevin's 1.25 minutes.
>
> Martin, - can you test the below on your box and tell me how it
> performs.
>
> Below is the revised union aggregate and bench marks.
>
> Thanks,
> Regina
>
> CREATE OR REPLACE FUNCTION st_unitecascade_garray(geometry[], iter int,
> startcount int)
>   RETURNS geometry AS
> $$
> SELECT CASE WHEN array_upper($1,1) < 5 OR  array_upper($1,1) BETWEEN 0
> AND ln($3 + 1)/ln($2 + 1) THEN st_unite_garray($1) ELSE
> 	st_unitecascade_garray(ARRAY(SELECT st_unite_garray($1[i:least(i
> + CAST(ln($3 + 1)/ln($2 + 1) As int) - 1,array_upper($1,1))]) As geom
> FROM generate_series(1, array_upper($1,1), CAST(ln($3 + 1)/ln($2 + 1) As
> int)) As i), $2 + 1, $3) END
>
> $$
>   LANGUAGE 'sql' IMMUTABLE;
>
> CREATE OR REPLACE FUNCTION st_unitecascade_garray_sort(geometry[])
>   RETURNS geometry AS
> $$
> 	SELECT CASE WHEN array_upper($1,1) < 51  THEN
> st_unite_garray($1) ELSE
> 		st_unitecascade_garray(ARRAY(SELECT ($1)[s] FROM
> generate_series(1, array_upper($1, 1)) AS s 
> 			ORDER BY ST_SnapToGrid(ST_Centroid(($1)[s]),0.1)
> ) , 1, array_upper($1,1)) END;
> $$
>   LANGUAGE 'sql' IMMUTABLE;
>
>
>   
> --DROP AGGREGATE st_cascadeunion(geometry);
>
> CREATE AGGREGATE st_cascadeunion(geometry) (
>   SFUNC=st_geom_accum,
>   STYPE=geometry[],
>   FINALFUNC=st_unitecascade_garray_sort
> );
>
> --BEGIN TESTS --
> CREATE INDEX idx_sample_poly_the_geom
>   ON sample_poly
>   USING gist
>   (the_geom);
>
>   
> ALTER TABLE sample_poly CLUSTER ON idx_sample_poly_the_geom;
> CLUSTER sample_poly;
>
>
> vacuum analyze sample_poly;
>
>
>
>
> --2409 ms, 2438 ms, 2422 ms )
> SELECT ST_CascadeUnion(the_geom)
> FROM (SELECT * FROM sample_poly limit 2000) As foo;
>
>
> --2578 ms, 2563 ms, 2657 ms
> SELECT ST_Union(the_geom) AS the_geom
> FROM (
>    SELECT min(id) AS id, ST_Union(the_geom) AS the_geom
>    FROM (
>      SELECT min(id) AS id, ST_Union(the_geom) AS the_geom
>      FROM (
>        SELECT min(id) AS id, ST_Union(the_geom) AS the_geom
>        FROM (
>          SELECT min(id) AS id, ST_Union(the_geom) AS the_geom
>          FROM (SELECT * FROM sample_poly limit 2000) As foo
>          GROUP BY round(id/10)
>          ORDER BY id) AS tmp1
>        GROUP BY round(id/100)
>        ORDER BY id) AS tmp2
>      GROUP BY round(id/1000)
>      ORDER BY id) AS tmp3
>    GROUP BY round(id/10000)
>    ORDER BY id) AS tmp4
> GROUP BY round(id/100000);
>
> -- 7594 ms, 7672 ms, 7718 ms
> SELECT ST_CascadeUnion(the_geom)
> FROM (SELECT * FROM sample_poly limit 5000) As foo;
>
> --8438 ms, 8343 ms, 8328 ms
> SELECT ST_Union(the_geom) AS the_geom
> FROM (
>    SELECT min(id) AS id, ST_Union(the_geom) AS the_geom
>    FROM (
>      SELECT min(id) AS id, ST_Union(the_geom) AS the_geom
>      FROM (
>        SELECT min(id) AS id, ST_Union(the_geom) AS the_geom
>        FROM (
>          SELECT min(id) AS id, ST_Union(the_geom) AS the_geom
>          FROM (SELECT * FROM sample_poly limit 5000) As foo
>          GROUP BY round(id/10)
>          ORDER BY id) AS tmp1
>        GROUP BY round(id/100)
>        ORDER BY id) AS tmp2
>      GROUP BY round(id/1000)
>      ORDER BY id) AS tmp3
>    GROUP BY round(id/10000)
>    ORDER BY id) AS tmp4
> GROUP BY round(id/100000);
>
> -- 17,921 ms, 18016 ms = .29 min
> SELECT ST_CascadeUnion(the_geom)
> FROM (SELECT * FROM sample_poly limit 8000) As foo;
>
> -- 15,296 ms 15172 ms = .25 min
> SELECT ST_Union(the_geom) AS the_geom
> FROM (
>    SELECT min(id) AS id, ST_Union(the_geom) AS the_geom
>    FROM (
>      SELECT min(id) AS id, ST_Union(the_geom) AS the_geom
>      FROM (
>        SELECT min(id) AS id, ST_Union(the_geom) AS the_geom
>        FROM (
>          SELECT min(id) AS id, ST_Union(the_geom) AS the_geom
>          FROM (SELECT * FROM sample_poly limit 8000) As foo
>          GROUP BY round(id/10)
>          ORDER BY id) AS tmp1
>        GROUP BY round(id/100)
>        ORDER BY id) AS tmp2
>      GROUP BY round(id/1000)
>      ORDER BY id) AS tmp3
>    GROUP BY round(id/10000)
>    ORDER BY id) AS tmp4
> GROUP BY round(id/100000);
>
> -- 245,497 ms = 4.09 min
> SELECT ST_CascadeUnion(the_geom)
> FROM (SELECT * FROM sample_poly) As foo;
>
> -- 77,078 ms, 75124 ms = 1.26 min
> SELECT ST_Union(the_geom) AS the_geom
> FROM (
>    SELECT min(id) AS id, ST_Union(the_geom) AS the_geom
>    FROM (
>      SELECT min(id) AS id, ST_Union(the_geom) AS the_geom
>      FROM (
>        SELECT min(id) AS id, ST_Union(the_geom) AS the_geom
>        FROM (
>          SELECT min(id) AS id, ST_Union(the_geom) AS the_geom
>          FROM (SELECT * FROM sample_poly) As foo
>          GROUP BY round(id/10)
>          ORDER BY id) AS tmp1
>        GROUP BY round(id/100)
>        ORDER BY id) AS tmp2
>      GROUP BY round(id/1000)
>      ORDER BY id) AS tmp3
>    GROUP BY round(id/10000)
>    ORDER BY id) AS tmp4
> GROUP BY round(id/100000);
>   
>
> -----Original Message-----
> From: postgis-devel-bounces at postgis.refractions.net
> [mailto:postgis-devel-bounces at postgis.refractions.net] On Behalf Of Paul
> Ramsey
> Sent: Thursday, August 07, 2008 3:55 PM
> To: PostGIS Development Discussion
> Subject: Re: [postgis-devel] Lame attempt at cascade union in sql
>
> Gods,
>
> Your last test is a close approximation to the true cascaded union,
> and that's 2 orders of magnitude faster than the naive approach.
>
> P.
>
> -----------------------------------------
> The substance of this message, including any attachments, may be
> confidential, legally privileged and/or exempt from disclosure
> pursuant to Massachusetts law. It is intended
> solely for the addressee. If you received this in error, please
> contact the sender and delete the material from any computer.
>
> _______________________________________________
> postgis-devel mailing list
> postgis-devel at postgis.refractions.net
> http://postgis.refractions.net/mailman/listinfo/postgis-devel
>
>   

-- 
Martin Davis
Senior Technical Architect
Refractions Research, Inc.
(250) 383-3022