kNN Clustering - DOHMH New York City Restaurant Inspection Results

Find groups of different business names that might be alternative representations of the same venue. This notebook is an example of the kNN clustering supported by openclean.

[1]:
# Open the downloaded dataset to extract the relevant columns and records.

import os

from openclean.pipeline import stream

# Location of the downloaded dataset file (gzip-compressed TSV) inside
# the local 'data' folder.
data_file = os.path.join('data', '43nn-pn8j.tsv.gz')

# Create the data stream that the following cells select columns from.
df = stream(data_file)

Extract Relevant Records

Get the set of distinct business names from the DBA column.

[2]:
# Get the distinct set of business names from the DBA column. Computing
# the distinct set first avoids computing keys for the same business
# name multiple times.
dba_column = df.select('DBA')
dba = dba_column.distinct()

# Number of distinct names and the sum of their frequency counts.
distinct_count = len(dba)
total_count = sum(dba.values())

# NOTE(review): "bisiness" in the message below is a typo for "business";
# left unchanged here so the message still matches the recorded output.
print('{} distinct bisiness names (for {} total values)'.format(distinct_count, total_count))
21046 distinct bisiness names (for 392131 total values)
[3]:
# Cluster business names using kNN clusterer (with the default n-gram setting)
# using the Levenshtein distance as the similarity measure.
# Remove clusters that contain fewer than `minsize` (five) distinct values
# (for display purposes).

from openclean.cluster.knn import knn_clusters
from openclean.function.similarity.base import SimilarityConstraint
from openclean.function.similarity.text import LevenshteinDistance
from openclean.function.value.threshold import GreaterThan

# Minimum number of distinct values a cluster must contain. Five is used
# here to limit the number of clusters that are printed in the next cell.
minsize = 5

# Similarity constraint handed to the clusterer: the Levenshtein distance
# function combined with a greater-than-0.9 threshold predicate.
similarity = SimilarityConstraint(
    func=LevenshteinDistance(),
    pred=GreaterThan(0.9)
)

clusters = knn_clusters(values=dba, sim=similarity, minsize=minsize)

print('{} clusters of size {} or greater'.format(len(clusters), minsize))
17 clusters of size 5 or greater
[4]:
# For each cluster print cluster values, their frequency counts,
# and the suggested common value for the cluster.

def print_cluster(cnumber, cluster):
    """Print a single cluster to standard output.

    The output consists of a header line with the cluster number and the
    number of entries in the cluster, one line per cluster value together
    with its count, and a final line showing the value returned by the
    cluster's ``suggestion()`` method.

    Parameters
    ----------
    cnumber:
        Number that is shown for the cluster in the header line.
    cluster:
        Cluster object that supports ``len()``, ``items()`` (yielding
        value/count pairs), and ``suggestion()``.
    """
    header = 'Cluster {} (of size {})\n'.format(cnumber, len(cluster))
    print(header)
    # One line per (value, count) pair in the cluster's iteration order.
    for entry in cluster.items():
        print('{} ({})'.format(*entry))
    footer = '\nSuggested value: {}\n\n'.format(cluster.suggestion())
    print(footer)

# Order the clusters in place from most to fewest distinct values.
clusters.sort(key=len, reverse=True)

# Print every cluster, numbering them starting at one.
for rank, cluster in enumerate(clusters, start=1):
    print_cluster(rank, cluster)

Cluster 1 (of size 11)

DUNKIN', BASKIN ROBBINS (1147)
DUNKIN, BASKIN ROBBINS (20)
DUNKIN BASKIN ROBBINS (52)
DUNKIN'  BASKIN ROBBINS (7)
DUNKIN'/BASKIN ROBBINS (15)
DUNKIN', BASKINS ROBBINS (19)
DUNKIN  BASKIN ROBBINS (13)
DUNKIN' & BASKIN ROBBINS (13)
DUNKIN /BASKIN ROBBINS (9)
DUNKIN BASKIN ROBINS (8)
DUNKIN' BASKIN ROBBINS (6)

Suggested value: DUNKIN', BASKIN ROBBINS


Cluster 2 (of size 11)

DUNKIN', BASKIN ROBBINS (1147)
DUNKIN, BASKIN ROBBINS (20)
DUNKIN BASKIN ROBBINS (52)
DUNKIN' BASKIN ROBBINS (6)
DUNKIN'/BASKIN ROBBINS (15)
DUNKIN', BASKINS ROBBINS (19)
DUNKIN  BASKIN ROBBINS (13)
DUNKIN' & BASKIN ROBBINS (13)
DUNKIN /BASKIN ROBBINS (9)
DUNKIN  / BASKIN ROBBINS (2)
DUNKIN'  BASKIN ROBBINS (7)

Suggested value: DUNKIN', BASKIN ROBBINS


Cluster 3 (of size 11)

DUNKIN', BASKIN ROBBINS (1147)
DUNKIN, BASKIN ROBBINS (20)
DUNKIN BASKIN ROBBINS (52)
DUNKIN' BASKIN ROBBINS (6)
DUNKIN'  BASKIN ROBBINS (7)
DUNKIN'/BASKIN ROBBINS (15)
DUNKIN' & BASKIN ROBBINS (13)
DUNKIN /BASKIN ROBBINS (9)
DUNKIN BASKIN ROBINS (8)
DUNKIN  / BASKIN ROBBINS (2)
DUNKIN  BASKIN ROBBINS (13)

Suggested value: DUNKIN', BASKIN ROBBINS


Cluster 4 (of size 10)

DUNKIN', BASKIN ROBBINS (1147)
DUNKIN BASKIN ROBBINS (52)
DUNKIN' BASKIN ROBBINS (6)
DUNKIN'  BASKIN ROBBINS (7)
DUNKIN'/BASKIN ROBBINS (15)
DUNKIN', BASKINS ROBBINS (19)
DUNKIN  BASKIN ROBBINS (13)
DUNKIN /BASKIN ROBBINS (9)
DUNKIN BASKIN ROBINS (8)
DUNKIN, BASKIN ROBBINS (20)

Suggested value: DUNKIN', BASKIN ROBBINS


Cluster 5 (of size 9)

DUNKIN, BASKIN ROBBINS (20)
DUNKIN BASKIN ROBBINS (52)
DUNKIN' BASKIN ROBBINS (6)
DUNKIN'  BASKIN ROBBINS (7)
DUNKIN'/BASKIN ROBBINS (15)
DUNKIN', BASKINS ROBBINS (19)
DUNKIN  BASKIN ROBBINS (13)
DUNKIN' & BASKIN ROBBINS (13)
DUNKIN', BASKIN ROBBINS (1147)

Suggested value: DUNKIN', BASKIN ROBBINS


Cluster 6 (of size 9)

DUNKIN', BASKIN ROBBINS (1147)
DUNKIN, BASKIN ROBBINS (20)
DUNKIN' BASKIN ROBBINS (6)
DUNKIN'  BASKIN ROBBINS (7)
DUNKIN'/BASKIN ROBBINS (15)
DUNKIN  BASKIN ROBBINS (13)
DUNKIN /BASKIN ROBBINS (9)
DUNKIN BASKIN ROBINS (8)
DUNKIN BASKIN ROBBINS (52)

Suggested value: DUNKIN', BASKIN ROBBINS


Cluster 7 (of size 9)

DUNKIN, BASKIN ROBBINS (20)
DUNKIN BASKIN ROBBINS (52)
DUNKIN' BASKIN ROBBINS (6)
DUNKIN'  BASKIN ROBBINS (7)
DUNKIN'/BASKIN ROBBINS (15)
DUNKIN  BASKIN ROBBINS (13)
DUNKIN BASKIN ROBINS (8)
DUNKIN  / BASKIN ROBBINS (2)
DUNKIN /BASKIN ROBBINS (9)

Suggested value: DUNKIN BASKIN ROBBINS


Cluster 8 (of size 8)

DUNKIN', BASKIN ROBBINS (1147)
DUNKIN, BASKIN ROBBINS (20)
DUNKIN BASKIN ROBBINS (52)
DUNKIN' BASKIN ROBBINS (6)
DUNKIN'  BASKIN ROBBINS (7)
DUNKIN  BASKIN ROBBINS (13)
DUNKIN /BASKIN ROBBINS (9)
DUNKIN'/BASKIN ROBBINS (15)

Suggested value: DUNKIN', BASKIN ROBBINS


Cluster 9 (of size 6)

DUNKIN', BASKIN ROBBINS (1147)
DUNKIN' BASKIN ROBBINS (6)
DUNKIN'  BASKIN ROBBINS (7)
DUNKIN  BASKIN ROBBINS (13)
DUNKIN  / BASKIN ROBBINS (2)
DUNKIN' & BASKIN ROBBINS (13)

Suggested value: DUNKIN', BASKIN ROBBINS


Cluster 10 (of size 6)

DUNKIN, BASKIN ROBBINS (20)
DUNKIN BASKIN ROBBINS (52)
DUNKIN' BASKIN ROBBINS (6)
DUNKIN  BASKIN ROBBINS (13)
DUNKIN /BASKIN ROBBINS (9)
DUNKIN BASKIN ROBINS (8)

Suggested value: DUNKIN BASKIN ROBBINS


Cluster 11 (of size 6)

CITI FIELD STAND 321 (6)
CITI FIELD STAND 121 (5)
CITI FIELD STAND 425 (6)
CITI FIELD STAND 431 (5)
CITI FIELD STAND 423 (6)
CITI FIELD STAND 421 (7)

Suggested value: CITI FIELD STAND 421


Cluster 12 (of size 6)

CITI FIELD STAND 335 (4)
CITI FIELD STAND 415 (5)
CITI FIELD STAND 425 (6)
CITI FIELD STAND 433 (4)
CITI FIELD STAND 431 (5)
CITI FIELD STAND 435 (4)

Suggested value: CITI FIELD STAND 425


Cluster 13 (of size 5)

CHIPOTLE MEXICAN GRILL #2308 (6)
CHIPOTLE MEXICAN GRILL #2834 (8)
CHIPOTLE MEXICAN GRILL #2918 (3)
CHIPOTLE MEXICAN GRILL #2879 (3)
CHIPOTLE MEXICAN GRILL #2838 (7)

Suggested value: CHIPOTLE MEXICAN GRILL #2834


Cluster 14 (of size 5)

DUNKIN', BASKIN ROBBINS (1147)
DUNKIN, BASKIN ROBBINS (20)
DUNKIN' BASKIN ROBBINS (6)
DUNKIN'  BASKIN ROBBINS (7)
DUNKIN', BASKINS ROBBINS (19)

Suggested value: DUNKIN', BASKIN ROBBINS


Cluster 15 (of size 5)

DUNKIN'  BASKIN ROBBINS (7)
DUNKIN  BASKIN ROBBINS (13)
DUNKIN' & BASKIN ROBBINS (13)
DUNKIN /BASKIN ROBBINS (9)
DUNKIN  / BASKIN ROBBINS (2)

Suggested value: DUNKIN  BASKIN ROBBINS


Cluster 16 (of size 5)

SERVICE BAR 6 (10)
SERVICE BAR 8 (5)
SERVICE BAR 5 (6)
SERVICE BAR 3 (5)
SERVICE BAR 7 (3)

Suggested value: SERVICE BAR 6


Cluster 17 (of size 5)

CITI FIELD STAND 421 (7)
CITI FIELD STAND 435 (4)
CITI FIELD STAND 415 (5)
CITI FIELD STAND 423 (6)
CITI FIELD STAND 425 (6)

Suggested value: CITI FIELD STAND 421


[5]:
# Perform normalization of business names first to get an
# initial set of clusters using key collision clustering.
# Then run kNN clustering on the collision keys.

from collections import Counter

from openclean.cluster.knn import knn_collision_clusters

# Similarity constraint for the clusterer: Levenshtein distance function
# combined with a greater-than-0.9 threshold predicate.
collision_similarity = SimilarityConstraint(
    func=LevenshteinDistance(),
    pred=GreaterThan(0.9)
)

# Note that this replaces the clusters computed by the plain kNN run.
clusters = knn_collision_clusters(
    values=dba,
    sim=collision_similarity,
    minsize=minsize
)

print('{} clusters of size {} or greater'.format(len(clusters), minsize))
4 clusters of size 5 or greater
[6]:
# Print the resulting clusters, ordered in place from most to fewest
# distinct values and numbered starting at one.

clusters.sort(key=len, reverse=True)

for rank, cluster in enumerate(clusters, start=1):
    print_cluster(rank, cluster)

Cluster 1 (of size 12)

DUNKIN', BASKINS ROBBINS (19)
DUNKIN BASKIN ROBINS (8)
DUNKIN', BASKIN ROBBINS (1147)
DUNKIN, BASKIN ROBBINS (20)
DUNKIN BASKIN ROBBINS (52)
DUNKIN' BASKIN ROBBINS (6)
DUNKIN'  BASKIN ROBBINS (7)
DUNKIN  BASKIN ROBBINS (13)
Dunkin/ Baskin Robbins (12)
DUNKIN' & BASKIN ROBBINS (13)
DUNKIN /BASKIN ROBBINS (9)
DUNKIN  / BASKIN ROBBINS (2)

Suggested value: DUNKIN', BASKIN ROBBINS


Cluster 2 (of size 7)

KENNEDY FRIED CHICKEN (1294)
Kennedy Fried Chicken (50)
Kennedy fried chicken (10)
KENNEDY  FRIED CHICKEN (8)
U.S KENNEDY FRIED CHICKEN (12)
US KENNEDY FRIED CHICKEN (27)
KENNEDY'S FRIED CHICKEN (16)

Suggested value: KENNEDY FRIED CHICKEN


Cluster 3 (of size 6)

CITI FIELD STAND 121 (5)
CITI FIELD STAND 321 (6)
CITI FIELD STAND 423 (6)
CITI FIELD STAND 425 (6)
CITI FIELD STAND 431 (5)
CITI FIELD STAND 421 (7)

Suggested value: CITI FIELD STAND 421


Cluster 4 (of size 6)

CITI FIELD STAND 335 (4)
CITI FIELD STAND 415 (5)
CITI FIELD STAND 425 (6)
CITI FIELD STAND 431 (5)
CITI FIELD STAND 433 (4)
CITI FIELD STAND 435 (4)

Suggested value: CITI FIELD STAND 425