[postgis-tickets] [SCM] PostGIS branch master updated. 3.3.0rc2-964-g06bed87f3

git at osgeo.org git at osgeo.org
Mon Jun 12 00:32:47 PDT 2023


This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "PostGIS".

The branch, master has been updated
       via  06bed87f309bc0c207b5d0ca22680d368c7347ed (commit)
      from  5e05bc3e7461e19055f16af367c6fc9260222179 (commit)

Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.

- Log -----------------------------------------------------------------
commit 06bed87f309bc0c207b5d0ca22680d368c7347ed
Author: Regina Obe <lr at pcorp.us>
Date:   Mon Jun 12 03:30:06 2023 -0400

    debug_standardize_address function for debugging
    address standardizer rules.  Json formatted output
    References #5397
    
    Includes
    - rulestring (currently missing the rule type and weight)
    - tokens which includes input words and corresponding standardize output
    - the stdaddr (best guess) address

diff --git a/NEWS b/NEWS
index a59859ac2..a6fc1e74e 100644
--- a/NEWS
+++ b/NEWS
@@ -17,6 +17,7 @@ xxxx/xx/xx
   - #5336, topogeometry cast to topoelement support (Regina Obe)
   - Allow singleton geometry to be inserted into Geometry(Multi*) columns (Paul Ramsey)
   - GH721, New window-based ST_ClusterWithinWin and ST_ClusterIntersectingWin (Paul Ramsey)
+  - #5397, [address_standardizer] debug_standardize_address function (Regina Obe)
 
 * Enhancements *
   - #5194, do not update system catalogs from postgis_extensions_upgrade (Sandro Santilli)
diff --git a/doc/extras_address_standardizer.xml b/doc/extras_address_standardizer.xml
index f70a6d4f9..9104eac04 100644
--- a/doc/extras_address_standardizer.xml
+++ b/doc/extras_address_standardizer.xml
@@ -565,33 +565,160 @@ into includes in the future for easier maintenance.</para></listitem>
   </sect2>
 
   <sect2 id="Address_Standardizer_Functions"><title>Address Standardizer Functions</title>
-		<refentry id="parse_address">
-		<refnamediv>
-			<refname>parse_address</refname>
-
-			<refpurpose>Takes a 1 line address and breaks into parts</refpurpose>
-		</refnamediv>
-
-		<refsynopsisdiv>
-			<funcsynopsis>
-				 <funcprototype>
-					<funcdef>record <function>parse_address</function></funcdef>
-					<paramdef><type>text </type> <parameter>address</parameter></paramdef>
-				</funcprototype>
-
-			</funcsynopsis>
-		</refsynopsisdiv>
-
-		<refsection>
-			<title>Description</title>
 
-			<para>Returns takes an address as input, and returns a record output consisting of fields <emphasis>num</emphasis>, <emphasis>street</emphasis>, <emphasis>street2</emphasis>,
-			<emphasis>address1</emphasis>, <emphasis>city</emphasis>, <emphasis>state</emphasis>, <emphasis>zip</emphasis>, <emphasis>zipplus</emphasis>, <emphasis>country</emphasis>.</para>
-
-			<!-- use this format if new function -->
-		<para>Availability: 2.2.0</para>
-		<para>&address_standardizer_required;</para>
-		</refsection>
+        <refentry id="debug_standardize_address">
+        <refnamediv>
+            <refname>debug_standardize_address</refname>
+
+            <refpurpose>Returns a json formatted text listing the parse tokens and standardizations</refpurpose>
+        </refnamediv>
+
+        <refsynopsisdiv>
+            <funcsynopsis>
+                <funcprototype>
+                    <funcdef>text <function>debug_standardize_address</function></funcdef>
+                    <paramdef><type>text </type> <parameter>lextab</parameter></paramdef>
+                    <paramdef><type>text </type> <parameter>gaztab</parameter></paramdef>
+                    <paramdef><type>text </type> <parameter>rultab</parameter></paramdef>
+                    <paramdef><type>text </type> <parameter>micro</parameter></paramdef>
+                    <paramdef choice="opt"><type>text </type> <parameter>macro=NULL</parameter></paramdef>
+                </funcprototype>
+            </funcsynopsis>
+        </refsynopsisdiv>
+
+        <refsection>
+            <title>Description</title>
+
+            <para>This is a function for debugging address standardizer rules and lex/gaz mappings.  It returns JSON-formatted text that includes the matching rules, the mapping of tokens, and the best standardized <xref linkend="stdaddr" /> form of an input address, given the names of the <xref linkend="lextab" />, <xref linkend="gaztab" />, and <xref linkend="rulestab" /> tables and an address.</para>
+
+            <para>For single-line addresses, pass just <varname>micro</varname>.</para>
+            <para>For a two-line address, pass a <varname>micro</varname> consisting of the standard first line of a postal address, e.g. <code>house_num street</code>, and a <varname>macro</varname> consisting of the standard second line of a postal address, e.g. <code>city, state postal_code country</code>.</para>
+
+            <!-- use this format if new function -->
+        <para>Availability: 3.4.0</para>
+        <para>&address_standardizer_required;</para>
+        </refsection>
+
+
+        <refsection>
+            <title>Examples</title>
+            <para>Using address_standardizer_data_us extension</para>
+            <programlisting>CREATE EXTENSION address_standardizer_data_us; -- only needs to be done once</programlisting>
+
+        <para>Variant 1: Single line address.  This doesn't work well with non-US addresses</para>
+        <programlisting>SELECT s::jsonb
+            FROM debug_standardize_address('us_lex',
+                'us_gaz', 'us_rules', 'One Devonshire Place, PH 301, Boston, MA 02109') AS s;</programlisting>
+            <screen><![CDATA[{
+  "macro": "Boston,MA,02109,US,",
+  "micro": "One Devonshire Place, PH 301",
+  "rules": [
+    {
+      "no": 0,
+      "score": 0.87625,
+      "raw_score": 0.87625,
+      "rule_string": "0 1 2 17 0 -1 1 5 6 17 17",
+      "rule_tokens": [
+        {
+          "pos": 0,
+          "input-word": "ONE",
+          "input-token": "NUMBER",
+          "mapped-word": "1",
+          "output-token": "HOUSE",
+          "input-token-code": 0,
+          "output-token-code": 1
+        },
+        {
+          "pos": 1,
+          "input-word": "DEVONSHIRE",
+          "input-token": "WORD",
+          "mapped-word": "DEVONSHIRE",
+          "output-token": "STREET",
+          "input-token-code": 1,
+          "output-token-code": 5
+        },
+        :
+      ]
+    },
+    {
+      :
+    }
+  ],
+  "stdaddr": {
+    "box": null,
+    "city": "BOSTON",
+    "name": "DEVONSHIRE",
+    "qual": null,
+    "unit": "# PENTHOUSE 301",
+    "extra": null,
+    "state": "MASSACHUSETTS",
+    "predir": null,
+    "sufdir": null,
+    "country": "USA",
+    "pretype": null,
+    "suftype": "PLACE",
+    "building": null,
+    "postcode": "02109",
+    "house_num": "1",
+    "ruralroute": null
+  },
+  "input_tokens": [
+    {
+      "pos": 0,
+      "word": "ONE",
+      "token": "NUMBER",
+      "stdword": "1",
+      "token-code": 0
+    },
+    {
+      "pos": 0,
+      "word": "ONE",
+      "token": "WORD",
+      "stdword": "1",
+      "token-code": 1
+    },
+	:
+  ]
+}
+]]></screen>
+
+        </refsection>
+
+        <!-- Optionally add a "See Also" section -->
+        <refsection>
+            <title>See Also</title>
+
+            <para><xref linkend="stdaddr" />, <xref linkend="rulestab" />, <xref linkend="lextab" />, <xref linkend="gaztab" />, <xref linkend="Pagc_Normalize_Address" /></para>
+        </refsection>
+        </refentry>
+
+        <refentry id="parse_address">
+        <refnamediv>
+            <refname>parse_address</refname>
+
+            <refpurpose>Takes a 1 line address and breaks into parts</refpurpose>
+        </refnamediv>
+
+        <refsynopsisdiv>
+            <funcsynopsis>
+                    <funcprototype>
+                    <funcdef>record <function>parse_address</function></funcdef>
+                    <paramdef><type>text </type> <parameter>address</parameter></paramdef>
+                </funcprototype>
+
+            </funcsynopsis>
+        </refsynopsisdiv>
+
+        <refsection>
+            <title>Description</title>
+
+            <para>Takes an address as input and returns a record consisting of the fields <emphasis>num</emphasis>, <emphasis>street</emphasis>, <emphasis>street2</emphasis>,
+            <emphasis>address1</emphasis>, <emphasis>city</emphasis>, <emphasis>state</emphasis>, <emphasis>zip</emphasis>, <emphasis>zipplus</emphasis>, <emphasis>country</emphasis>.</para>
+
+            <!-- use this format if new function -->
+        <para>Availability: 2.2.0</para>
+        <para>&address_standardizer_required;</para>
+        </refsection>
 
 
 		<refsection>
diff --git a/extensions/address_standardizer/address_standardizer.c b/extensions/address_standardizer/address_standardizer.c
index 0174924be..b4375dfdf 100644
--- a/extensions/address_standardizer/address_standardizer.c
+++ b/extensions/address_standardizer/address_standardizer.c
@@ -18,7 +18,7 @@ PG_MODULE_MAGIC;
 
 Datum standardize_address(PG_FUNCTION_ARGS);
 Datum standardize_address1(PG_FUNCTION_ARGS);
-
+Datum debug_standardize_address(PG_FUNCTION_ARGS);
 
 /*
  * The signature for standardize_address follows. The lextab, gaztab and
@@ -52,6 +52,266 @@ Datum standardize_address1(PG_FUNCTION_ARGS);
  *
  *    rule text
 */
+PG_FUNCTION_INFO_V1(debug_standardize_address);
+
+/*
+ * Append str to buf as a double-quoted JSON string, escaping quotes,
+ * backslashes and control characters; a NULL pointer becomes JSON null.
+ * quote_identifier() is unsuitable here: it applies SQL identifier rules.
+ */
+static void
+append_json_string(StringInfo buf, const char *str)
+{
+	const char *p;
+
+	if (!str)
+	{
+		appendStringInfoString(buf, "null");
+		return;
+	}
+	appendStringInfoChar(buf, '"');
+	for (p = str; *p; p++)
+	{
+		if (*p == '"' || *p == '\\')
+			appendStringInfo(buf, "\\%c", *p);
+		else if ((unsigned char) *p < 0x20)
+			appendStringInfo(buf, "\\u%04x", (unsigned int) (unsigned char) *p);
+		else
+			appendStringInfoChar(buf, *p);
+	}
+	appendStringInfoChar(buf, '"');
+}
+
+/* Append `"key": <JSON string or null>` to buf, preceded by a comma unless first. */
+static void
+append_json_member(StringInfo buf, const char *key, const char *value, bool first)
+{
+	appendStringInfo(buf, "%s\"%s\": ", first ? "" : ",", key);
+	append_json_string(buf, value);
+}
+
+/*
+ * debug_standardize_address(lextab, gaztab, rultab, micro [, macro])
+ *
+ * Debugging aid for address standardizer rules.  Returns JSON formatted text
+ * holding the micro/macro strings used, the candidate input tokens, every raw
+ * standardization (scores, token mapping, rule string) and the stdaddr fields.
+ * When macro is absent or NULL, micro is taken as a single-line address and
+ * split with parseaddress().  Returns NULL if a required argument is NULL.
+ */
+Datum
+debug_standardize_address(PG_FUNCTION_ARGS)
+{
+	STANDARDIZER   *std;
+	STAND_PARAM    *ms;
+	STZ_PARAM      *stz_info;
+	STZ           **stz_list;
+	STDADDR        *stdaddr;
+	DEF            *def;
+	char           *lextab, *gaztab, *rultab, *micro, *macro;
+	int             lex_pos, stz_no, n, started;
+	StringInfoData  rule_in;	/* input token codes of the current rule */
+	StringInfoData  rule_out;	/* output token codes of the current rule */
+	StringInfo      result;
+
+	elog(DEBUG2, "Start %s", __func__);
+
+	/* Not STRICT at the SQL level (macro may be NULL), so check the required arguments. */
+	if (PG_ARGISNULL(0) || PG_ARGISNULL(1) || PG_ARGISNULL(2) || PG_ARGISNULL(3))
+		PG_RETURN_NULL();
+
+	/* makeStringInfo() returns an initialized buffer; no initStringInfo() needed */
+	result = makeStringInfo();
+	initStringInfo(&rule_in);
+	initStringInfo(&rule_out);
+	appendStringInfoChar(result, '{');
+
+	lextab = text_to_cstring(PG_GETARG_TEXT_P(0));
+	gaztab = text_to_cstring(PG_GETARG_TEXT_P(1));
+	rultab = text_to_cstring(PG_GETARG_TEXT_P(2));
+	micro  = text_to_cstring(PG_GETARG_TEXT_P(3));
+
+	if ( (PG_NARGS()  > 4) && (!PG_ARGISNULL(4)) ) {
+		macro  = text_to_cstring(PG_GETARG_TEXT_P(4));
+	}
+	else {
+		/* Single-line address: derive micro and macro from the parsed parts. */
+		ADDRESS    *paddr;
+		HHash      *stH;
+		int         err;
+		int         k;
+
+		/* palloc0() raises an error on failure instead of returning NULL */
+		stH = (HHash *) palloc0(sizeof(HHash));
+
+		elog(DEBUG1, "going to load_state_hash");
+		err = load_state_hash(stH);
+		if (err) {
+			elog(DEBUG2, "got err=%d from load_state_hash().", err);
+#ifdef USE_HSEARCH
+			elog(DEBUG2, "calling hdestroy_r(stH).");
+			hdestroy_r(stH);
+#endif
+			elog(ERROR, "standardize_address: load_state_hash() failed(%d)!", err);
+		}
+
+		elog(DEBUG2, "calling parseaddress()");
+		paddr = parseaddress(stH, micro, &err);
+		if (!paddr)
+			elog(ERROR, "parse_address: parseaddress() failed!");
+
+		/* check for errors and compute length of macro string */
+		if (paddr->street2)
+			elog(ERROR, "standardize_address() can not be passed an intersection.");
+		if (! paddr-> address1)
+			elog(ERROR, "standardize_address() could not parse the address into components.");
+
+		k = 1;
+		if (paddr->city) k += strlen(paddr->city) + 1;
+		if (paddr->st)   k += strlen(paddr->st)   + 1;
+		if (paddr->zip)  k += strlen(paddr->zip)  + 1;
+		if (paddr->cc)   k += strlen(paddr->cc)   + 1;
+
+		/* create micro and macro from paddr */
+		micro = pstrdup(paddr->address1);
+		macro = (char *) palloc(k * sizeof(char));
+		*macro = '\0';
+		if (paddr->city) { strcat(macro, paddr->city); strcat(macro, ","); }
+		if (paddr->st  ) { strcat(macro, paddr->st  ); strcat(macro, ","); }
+		if (paddr->zip ) { strcat(macro, paddr->zip ); strcat(macro, ","); }
+		if (paddr->cc  ) { strcat(macro, paddr->cc  ); strcat(macro, ","); }
+	}
+
+	appendStringInfoString(result, "\"micro\":");
+	append_json_string(result, micro);
+	appendStringInfoString(result, ",\"macro\":");
+	append_json_string(result, macro);
+	appendStringInfoChar(result, ',');
+
+	std = GetStdUsingFCInfo(fcinfo, lextab, gaztab, rultab);
+	if (!std)
+		elog(ERROR, "%s failed to create the address standardizer object!",  __func__);
+
+	ms = std->misc_stand;
+	elog(DEBUG2, "%s: calling std_standardize_mm('%s', '%s')", __func__ , micro, macro);
+	stdaddr = std_standardize_mm( std, micro, macro, 0 );
+	elog(DEBUG2, "%s back from fetch_stdaddr",  __func__);
+	stz_info = ms->stz_info;
+
+	/* Every candidate definition of every input word. */
+	elog(DEBUG2, "Input tokenization candidates:\n");
+	appendStringInfoString(result, "\"input_tokens\":[");
+	started = 0;
+	for (lex_pos = FIRST_LEX_POS; lex_pos < ms->LexNum; lex_pos++)
+	{
+		for (def = ms->lex_vector[lex_pos].DefList; def != NULL; def = def->Next)
+		{
+			/* a protected definition keeps the input text as its standard word */
+			const char *stdword = def->Protect ? ms->lex_vector[lex_pos].Text : def->Standard;
+
+			if (started > 0)
+				appendStringInfoChar(result, ',');
+			appendStringInfo(result, "{\"pos\": %d,", lex_pos);
+			appendStringInfoString(result, "\"word\":");
+			append_json_string(result, ms->lex_vector[lex_pos].Text);
+			appendStringInfoString(result, ",\"stdword\":");
+			append_json_string(result, stdword);
+			appendStringInfoString(result, ",\"token\":");
+			append_json_string(result, in_symb_name(def->Type));
+			appendStringInfo(result, ",\"token-code\": %d}", def->Type);
+			elog(DEBUG2, "\t(%d) stdword: %s, tok: %d (%s)\n", lex_pos, stdword, def->Type, in_symb_name(def->Type));
+			started++;
+		}
+	}
+	appendStringInfoChar(result, ']');
+
+	n = stz_info->stz_list_size;
+	stz_list = stz_info->stz_array;
+
+	appendStringInfoString(result, ", \"rules\":[");
+	for (stz_no = 0; stz_no < n; stz_no++)
+	{
+		STZ *cur_stz = stz_list[stz_no];
+
+		if (stz_no > 0)
+			appendStringInfoChar(result, ',');
+		resetStringInfo(&rule_in);
+		resetStringInfo(&rule_out);
+
+		elog( DEBUG2, "Raw standardization %d with score %f:\n" , ( stz_no  ) , cur_stz->score ) ;
+		appendStringInfo(result, "{\"score\": %f,", cur_stz->score);
+		appendStringInfo(result, "\"raw_score\": %f,", cur_stz->raw_score);
+		appendStringInfo(result, "\"no\": %d,", stz_no);
+
+		appendStringInfoString(result, "\"rule_tokens\":[");
+		started = 0;
+		for (lex_pos = FIRST_LEX_POS; lex_pos < ms->LexNum; lex_pos++)
+		{
+			SYMB        k2 = cur_stz->output[lex_pos];
+			const char *mapped;
+			const char *out_name;
+
+			def = cur_stz->definitions[lex_pos];
+			mapped = def->Protect ? ms->lex_vector[lex_pos].Text : def->Standard;
+			/* a FAIL output is reported as NONE rather than looked up by name */
+			out_name = (k2 == FAIL) ? "NONE" : out_symb_name(k2);
+
+			if (started > 0)
+			{
+				appendStringInfoChar(result, ',');
+				appendStringInfoChar(&rule_in, ' ');
+				appendStringInfoChar(&rule_out, ' ');
+			}
+			appendStringInfo(&rule_in, "%d", def->Type);
+			appendStringInfo(&rule_out, "%d", k2);
+
+			appendStringInfo(result, "{\"pos\": %d,", lex_pos);
+			appendStringInfo(result, "\"input-token-code\": %d,", def->Type);
+			appendStringInfoString(result, "\"input-token\": ");
+			append_json_string(result, in_symb_name(def->Type));
+			appendStringInfoString(result, ",\"input-word\": ");
+			append_json_string(result, ms->lex_vector[lex_pos].Text);
+			appendStringInfoString(result, ",\"mapped-word\": ");
+			append_json_string(result, mapped);
+			appendStringInfo(result, ",\"output-token-code\": %d,", k2);
+			appendStringInfoString(result, "\"output-token\": ");
+			append_json_string(result, out_name);
+			appendStringInfoChar(result, '}');
+			elog( DEBUG2, "\t(%d) Input %d (%s) text %s mapped to output %d (%s)\n" , lex_pos , def->Type , in_symb_name( def->Type ) , mapped , k2 , out_name ) ;
+			started++;
+			if ( k2 == FAIL ) break ;
+		}
+
+		/* TODO: also append the rule type and weight (possibly via cur_stz->build_key) */
+		appendStringInfo(result, "], \"rule_string\":\"%s -1 %s\"}", rule_in.data, rule_out.data);
+	}
+	appendStringInfoChar(result, ']');
+
+	elog(DEBUG2, "%s: setup values json", __func__);
+	appendStringInfoString(result, ",\"stdaddr\": {");
+	if (stdaddr) {
+		append_json_member(result, "building", stdaddr->building, true);
+		append_json_member(result, "house_num", stdaddr->house_num, false);
+		append_json_member(result, "predir", stdaddr->predir, false);
+		append_json_member(result, "qual", stdaddr->qual, false);
+		append_json_member(result, "pretype", stdaddr->pretype, false);
+		append_json_member(result, "name", stdaddr->name, false);
+		append_json_member(result, "suftype", stdaddr->suftype, false);
+		append_json_member(result, "sufdir", stdaddr->sufdir, false);
+		append_json_member(result, "ruralroute", stdaddr->ruralroute, false);
+		append_json_member(result, "extra", stdaddr->extra, false);
+		append_json_member(result, "city", stdaddr->city, false);
+		append_json_member(result, "state", stdaddr->state, false);
+		append_json_member(result, "country", stdaddr->country, false);
+		append_json_member(result, "postcode", stdaddr->postcode, false);
+		append_json_member(result, "box", stdaddr->box, false);
+		append_json_member(result, "unit", stdaddr->unit, false);
+	}
+	stdaddr_free(stdaddr);
+	appendStringInfoString(result, "}");
+	appendStringInfoString(result, "}");
+	PG_RETURN_TEXT_P(cstring_to_text_with_len(result->data, result->len));
+}
 
 PG_FUNCTION_INFO_V1(standardize_address);
 
diff --git a/extensions/address_standardizer/address_standardizer_functions.sql.in b/extensions/address_standardizer/address_standardizer_functions.sql.in
index c89a91e95..65c3e919b 100644
--- a/extensions/address_standardizer/address_standardizer_functions.sql.in
+++ b/extensions/address_standardizer/address_standardizer_functions.sql.in
@@ -3,6 +3,17 @@
 -- Author: Stephen Woodbridge <woodbri at imaptools.com>
 ---------------------------------------------------------------------
 
+-- Availability: 3.4.0
+CREATE OR REPLACE FUNCTION debug_standardize_address(
+        lextab text,              -- name of the lexicon table
+        gaztab text,              -- name of the gazetteer table
+        rultab text,              -- name of the rules table
+        micro text,               -- single-line address, or first line of a two-line address
+        macro text DEFAULT NULL ) -- optional second line (city, state, postal code, country)
+    RETURNS text                  -- JSON formatted debugging output
+    AS  'MODULE_PATHNAME', 'debug_standardize_address'
+    LANGUAGE 'c' STABLE COST 200; -- STABLE, not IMMUTABLE: the result depends on the contents of the lex/gaz/rules tables
+
 CREATE OR REPLACE FUNCTION standardize_address(
         lextab text,
         gaztab text,

-----------------------------------------------------------------------

Summary of changes:
 NEWS                                               |   1 +
 doc/extras_address_standardizer.xml                | 179 ++++++++++++--
 .../address_standardizer/address_standardizer.c    | 262 ++++++++++++++++++++-
 .../address_standardizer_functions.sql.in          |  11 +
 4 files changed, 426 insertions(+), 27 deletions(-)


hooks/post-receive
-- 
PostGIS


More information about the postgis-tickets mailing list