Use GoldenMatch as Python UDFs inside DuckDB. Score strings, deduplicate tables, and run pipelines from SQL.
pip install goldenmatch-duckdb
Requires goldenmatch and duckdb as dependencies. Also needs pyarrow for Polars conversion (.pl()).
import duckdb
import goldenmatch_duckdb
con = duckdb.connect()
goldenmatch_duckdb.register(con)
# Score two strings
con.sql("SELECT goldenmatch_score('John Smith', 'Jon Smyth', 'jaro_winkler')").show()
# 0.884
Score two strings with a named scorer.
SELECT goldenmatch_score('John Smith', 'Jon Smyth', 'jaro_winkler');
-- 0.884
SELECT goldenmatch_score('hello', 'helo', 'levenshtein');
-- 0.8
SELECT goldenmatch_score('Smith', 'Smyth', 'soundex_match');
-- 1.0
Available scorers: jaro_winkler, levenshtein, exact, token_sort, soundex_match.
Deduplicate a DuckDB table.
SELECT * FROM goldenmatch_dedupe('customers', '{"exact": ["email"]}');
con.sql("""
SELECT * FROM goldenmatch_dedupe('customers', '{
"matchkeys": [{
"name": "fuzzy",
"type": "weighted",
"threshold": 0.85,
"fields": [
{"field": "name", "scorer": "jaro_winkler", "weight": 0.7},
{"field": "zip", "scorer": "exact", "weight": 0.3}
]
}]
}')
""").show()
Match a target table against a reference table.
SELECT * FROM goldenmatch_match('new_customers', 'master', '{"fuzzy": {"name": 0.85}}');
Score a pair of JSON records.
SELECT goldenmatch_score_pair(
'{"name": "John Smith", "zip": "10001"}',
'{"name": "Jon Smyth", "zip": "10001"}',
'{"fuzzy": {"name": 0.7, "zip": 0.3}}'
);
Explain a match decision in natural language.
SELECT goldenmatch_explain(
'{"name": "John Smith"}',
'{"name": "Jon Smyth"}',
'{"fuzzy": {"name": 1.0}}'
);
-- Run AutoConfigController on a table; get committed config JSON.
SELECT goldenmatch_autoconfig('customers');
-- Same call, but returns the controller telemetry blob (stop_reason,
-- health, decisions, indicator priors, committed NE).
SELECT goldenmatch_autoconfig_telemetry('customers');
-- Run dedupe with a *full* GoldenMatchConfig JSON. Unlike the slim
-- `goldenmatch_dedupe_table` kwargs, this accepts `negative_evidence`
-- (Path Y) and every other Pydantic field.
SELECT goldenmatch_dedupe_full('customers', '{
"matchkeys": [
{"name": "exact_email", "type": "exact",
"fields": [{"field": "email", "transforms": ["lowercase"]}],
"negative_evidence": [
{"field": "phone", "scorer": "exact",
"transforms": ["digits_only"], "threshold": 0.5, "penalty": 0.5}
]}
]
}');
-- Job pipeline: gm_run captures controller telemetry into in-memory state.
SELECT gm_configure('cust_job', '{"exact": ["email"]}');
SELECT gm_run('cust_job', 'customers');
SELECT gm_telemetry('cust_job'); -- last run's telemetry JSON
The telemetry JSON shape is identical across the DuckDB UDFs, the Postgres extension, the CLI goldenmatch autoconfig, and the web /api/v1/controller/telemetry endpoint — parse once, reuse everywhere.
-- List completed jobs
SELECT * FROM goldenmatch_list_jobs();
-- Get clusters from a job
SELECT * FROM goldenmatch_get_clusters('job_id');
-- Get golden records
SELECT * FROM goldenmatch_get_golden('job_id');
-- Get scored pairs
SELECT * FROM goldenmatch_get_pairs('job_id');
-- Get job status
SELECT goldenmatch_job_status('job_id');
-- Version
SELECT goldenmatch_version();
-- Available scorers
SELECT * FROM goldenmatch_list_scorers();
import duckdb
import goldenmatch_duckdb
con = duckdb.connect()
goldenmatch_duckdb.register(con)
con.sql("""
CREATE TABLE customers AS
SELECT * FROM read_csv_auto('customers.csv');
SELECT
a.name AS name_a,
b.name AS name_b,
goldenmatch_score(a.name, b.name, 'jaro_winkler') AS similarity
FROM customers a, customers b
WHERE a.rowid < b.rowid
AND goldenmatch_score(a.name, b.name, 'jaro_winkler') > 0.85;
""").show()
con.sql("""
CREATE TABLE customers AS SELECT * FROM read_csv_auto('customers.csv');
""")
result = con.sql("SELECT * FROM goldenmatch_dedupe('customers', '{\"exact\": [\"email\"]}')").pl()
result.write_csv("deduped.csv")
The dbt-goldenmatch package provides macros for DuckDB-based entity resolution:
pip install dbt-goldenmatch
-- In a dbt model
{{ run_goldenmatch_dedupe('customers', '{"exact": ["email"]}') }}
GoldenMatch also supports DuckDB as an out-of-core processing backend for datasets that don’t fit in memory:
import goldenmatch as gm
result = gm.dedupe("huge.csv", exact=["email"], backend="duckdb")
The DuckDB backend provides:
from goldenmatch.backends.duckdb_backend import DuckDBBackend
backend = DuckDBBackend("data.duckdb")
backend.write_table("customers", df)
df = backend.read_table("customers")
tables = backend.list_tables()
con.cursor() for internal table reads.pl() (Polars conversion) requires pyarrow as a dependency