[postgis-tickets] [SCM] PostGIS branch master updated. 3.3.0rc2-965-g58cd6af22
git at osgeo.org
git at osgeo.org
Mon Jun 12 08:01:20 PDT 2023
This is an automated email from the git hooks/post-receive script. It was
generated because a ref change was pushed to the repository containing
the project "PostGIS".
The branch, master has been updated
via 58cd6af22685f3c250d7c7f7bf458e528dae58ca (commit)
from 06bed87f309bc0c207b5d0ca22680d368c7347ed (commit)
Those revisions listed above that are new to this repository have
not appeared on any other notification email; so we list those
revisions in full, below.
- Log -----------------------------------------------------------------
commit 58cd6af22685f3c250d7c7f7bf458e528dae58ca
Author: Regina Obe <lr at pcorp.us>
Date: Mon Jun 12 10:24:48 2023 -0400
Cleanup
References #5397
- Get rid of unused variable and declaration warnings
- Revise documentation to show how you can expand the json document
diff --git a/doc/extras_address_standardizer.xml b/doc/extras_address_standardizer.xml
index 9104eac04..6e638e97e 100644
--- a/doc/extras_address_standardizer.xml
+++ b/doc/extras_address_standardizer.xml
@@ -594,6 +594,29 @@ into includes in the future for easier maintenance.</para></listitem>
<para>For single line addresses use just <varname>micro</varname></para>
<para>For a two-line address, use a <varname>micro</varname> consisting of the standard first line of a postal address, e.g. <code>house_num street</code>, and a macro consisting of the standard second line of a postal address, e.g. <code>city, state postal_code country</code>.</para>
+ <para>Elements returned in the json document are:</para>
+ <variablelist>
+ <varlistentry>
+ <term>input_tokens</term>
+ <listitem>
+ <para>For each word in the input address, returns the position of the word,
+ token categorization of the word, and the standard word it is mapped to.
+ Note that for some input words, you might get back multiple records because some inputs can be categorized
+ as more than one thing. </para>
+ </listitem>
+ </varlistentry>
+ <varlistentry><term>rules</term>
+ <listitem>
+ <para>The set of rules matching the input and the corresponding score for each. The first rule (highest scoring) is
+ what is used for standardization.</para>
+ </listitem>
+ </varlistentry>
+ <varlistentry><term>stdaddr</term>
+ <listitem>
+ <para>The standardized address elements that would be returned when running <xref linkend="standardize_address" />.</para>
+ </listitem>
+ </varlistentry>
+ </variablelist>
<!-- use this format if new function -->
<para>Availability: 3.4.0</para>
<para>&address_standardizer_required;</para>
@@ -605,82 +628,41 @@ into includes in the future for easier maintenance.</para></listitem>
<para>Using address_standardizer_data_us extension</para>
<programlisting>CREATE EXTENSION address_standardizer_data_us; -- only needs to be done once</programlisting>
- <para>Variant 1: Single line address. This doesn't work well with non-US addresses</para>
- <programlisting>SELECT s::jsonb
- FROM debug_standardize_address('us_lex',
- 'us_gaz', 'us_rules', 'One Devonshire Place, PH 301, Boston, MA 02109') AS s;</programlisting>
- <screen><![CDATA[{
- "macro": "Boston,MA,02109,US,",
- "micro": "One Devonshire Place, PH 301",
- "rules": [
- {
- "no": 0,
- "score": 0.87625,
- "raw_score": 0.87625,
- "rule_string": "0 1 2 17 0 -1 1 5 6 17 17",
- "rule_tokens": [
- {
- "pos": 0,
- "input-word": "ONE",
- "input-token": "NUMBER",
- "mapped-word": "1",
- "output-token": "HOUSE",
- "input-token-code": 0,
- "output-token-code": 1
- },
- {
- "pos": 1,
- "input-word": "DEVONSHIRE",
- "input-token": "WORD",
- "mapped-word": "DEVONSHIRE",
- "output-token": "STREET",
- "input-token-code": 1,
- "output-token-code": 5
- },
- :
- ]
- },
- :
- ]
- }
- ],
- "stdaddr": {
- "box": null,
- "city": "BOSTON",
- "name": "DEVONSHIRE",
- "qual": null,
- "unit": "# PENTHOUSE 301",
- "extra": null,
- "state": "MASSACHUSETTS",
- "predir": null,
- "sufdir": null,
- "country": "USA",
- "pretype": null,
- "suftype": "PLACE",
- "building": null,
- "postcode": "02109",
- "house_num": "1",
- "ruralroute": null
- },
- "input_tokens": [
- {
- "pos": 0,
- "word": "ONE",
- "token": "NUMBER",
- "stdword": "1",
- "token-code": 0
- },
- {
- "pos": 0,
- "word": "ONE",
- "token": "WORD",
- "stdword": "1",
- "token-code": 1
- },
- :
- ]
-}
-}]]></screen>
+ <para>Variant 1: Single line address and returning the input tokens</para>
+ <programlisting><![CDATA[SELECT it->>'pos' AS position, it->>'word' AS word, it->>'stdword' AS standardized_word,
+ it->>'token' AS token, it->>'token-code' AS token_code
+ FROM jsonb(
+ debug_standardize_address('us_lex',
+ 'us_gaz', 'us_rules', 'One Devonshire Place, PH 301, Boston, MA 02109')
+ ) AS s, jsonb_array_elements(s->'input_tokens') AS it;]]></programlisting>
+ <screen>position | word | standardized_word | token | token_code
+----------+------------+-------------------+--------+------------
+ 0 | ONE | 1 | NUMBER | 0
+ 0 | ONE | 1 | WORD | 1
+ 1 | DEVONSHIRE | DEVONSHIRE | WORD | 1
+ 2 | PLACE | PLACE | TYPE | 2
+ 3 | PH | PATH | TYPE | 2
+ 3 | PH | PENTHOUSE | UNITT | 17
+ 4 | 301 | 301 | NUMBER | 0
+(7 rows)</screen>
+
+ <para>Variant 2: Multi line address and returning first rule input mappings and score</para>
+ <programlisting><![CDATA[SELECT (s->'rules'->0->>'score')::numeric AS score, it->>'pos' AS position,
+ it->>'input-word' AS word, it->>'input-token' AS input_token, it->>'mapped-word' AS standardized_word,
+ it->>'output-token' AS output_token
+ FROM jsonb(
+ debug_standardize_address('us_lex',
+ 'us_gaz', 'us_rules', 'One Devonshire Place, PH 301', 'Boston, MA 02109')
+ ) AS s, jsonb_array_elements(s->'rules'->0->'rule_tokens') AS it;]]></programlisting>
+ <screen> score | position | word | input_token | standardized_word | output_token
+----------+----------+------------+-------------+-------------------+--------------
+ 0.876250 | 0 | ONE | NUMBER | 1 | HOUSE
+ 0.876250 | 1 | DEVONSHIRE | WORD | DEVONSHIRE | STREET
+ 0.876250 | 2 | PLACE | TYPE | PLACE | SUFTYP
+ 0.876250 | 3 | PH | UNITT | PENTHOUSE | UNITT
+ 0.876250 | 4 | 301 | NUMBER | 301 | UNITT
+(5 rows)
+</screen>
</refsection>
diff --git a/extensions/address_standardizer/address_standardizer.c b/extensions/address_standardizer/address_standardizer.c
index b4375dfdf..2a13d6406 100644
--- a/extensions/address_standardizer/address_standardizer.c
+++ b/extensions/address_standardizer/address_standardizer.c
@@ -63,15 +63,19 @@ debug_standardize_address(PG_FUNCTION_ARGS)
char *rultab;
char *micro;
char *macro;
- //Datum result;
STDADDR *stdaddr;
- char **values;
int k;
char rule_in[100];
char rule_out[100];
char temp[10];
int stz_no , n ;
- //SEG *rseg;
+ DEF *__def__ ;
+ STZ **__stz_list__;
+ STAND_PARAM *ms;
+ STZ_PARAM *__stz_info__ ;
+ int lex_pos;
+ int started;
+ STZ *__cur_stz__;
StringInfo result = makeStringInfo();
elog(DEBUG2, "Start %s", __func__);
@@ -90,8 +94,8 @@ debug_standardize_address(PG_FUNCTION_ARGS)
else {
ADDRESS *paddr;
HHash *stH;
- stH = (HHash *) palloc0(sizeof(HHash));
int err;
+ stH = (HHash *) palloc0(sizeof(HHash));
if (!stH) {
elog(ERROR, "%s: Failed to allocate memory for hash!", __func__);
return -1;
@@ -151,21 +155,16 @@ debug_standardize_address(PG_FUNCTION_ARGS)
if (!std)
elog(ERROR, "%s failed to create the address standardizer object!", __func__);
- //output_rule_statistics( std->pagc_p->rules, std->err_p );
- STAND_PARAM *ms = std->misc_stand;
elog(DEBUG2, "%s: calling std_standardize_mm('%s', '%s')", __func__ , micro, macro);
stdaddr = std_standardize_mm( std, micro, macro, 0 );
- elog(DEBUG2, "%s back from fetch_stdaddr", __func__);
- DEF *__def__ ;
- STZ **__stz_list__;
-
- STZ_PARAM *__stz_info__ = ms->stz_info ;
+ ms = std->misc_stand;
+ __stz_info__ = ms->stz_info ;
+ elog(DEBUG2, "%s back from fetch_stdaddr", __func__);
- int lex_pos;
elog(DEBUG2, "Input tokenization candidates:\n");
appendStringInfoString(result, "\"input_tokens\":[");
- int started = 0;
+ started = 0;
for (lex_pos = FIRST_LEX_POS;lex_pos < ms->LexNum;lex_pos ++)
{
@@ -202,9 +201,7 @@ debug_standardize_address(PG_FUNCTION_ARGS)
strcpy(rule_in, "");
strcpy(rule_out, "");
- STZ *__cur_stz__ = __stz_list__[stz_no] ;
-
- KW *ruleref = __cur_stz__->build_key;
+ __cur_stz__ = __stz_list__[stz_no] ;
elog( DEBUG2, "Raw standardization %d with score %f:\n" , ( stz_no ) , __cur_stz__->score ) ;
appendStringInfo(result, "{\"score\": %f,", __cur_stz__->score);
appendStringInfo(result, "\"raw_score\": %f,", __cur_stz__->raw_score);
-----------------------------------------------------------------------
Summary of changes:
doc/extras_address_standardizer.xml | 134 +++++++++------------
.../address_standardizer/address_standardizer.c | 29 ++---
2 files changed, 71 insertions(+), 92 deletions(-)
hooks/post-receive
--
PostGIS
More information about the postgis-tickets
mailing list