Token Signature Outliers for Street Names

Find street names that do not include at least one token from a token signature built from the USPS street abbreviations (street suffixes such as 'avenue', 'ave', or 'st') that commonly occur in U.S. address columns. Uses the NYC Parking Violations Issued - Fiscal Year 2014 dataset.

[1]:
# Download the full 'Parking Violations Issued - Fiscal Year 2014' dataset.

import gzip
import os

from openclean.data.source.socrata import Socrata

datafile = './jt7v-77mi.tsv.gz'

# Download file only if it does not exist already. The data is first written
# to a temporary file that is renamed once the download has completed. This
# way a failed or interrupted download never leaves an empty or truncated
# file behind that would be mistaken for a complete download (and silently
# reused) the next time this cell is run.
if not os.path.isfile(datafile):
    tmpfile = datafile + '.part'
    try:
        ds = Socrata().dataset('jt7v-77mi')
        print('Downloading ...\n')
        print(ds.name + '\n')
        print(ds.description)
        with gzip.open(tmpfile, 'wb') as f:
            ds.write(f)
        # Atomically move the completed download to its final location.
        os.replace(tmpfile, datafile)
    finally:
        # Remove the partial download if anything went wrong. After a
        # successful rename the temporary file no longer exists.
        if os.path.isfile(tmpfile):
            os.remove(tmpfile)


# As an alternative, you can also use the smaller dataset sample that is
# included in the repository.
#
# datafile = './data/jt7v-77mi.tsv.gz'

# Setup the environment for this demo. All downloaded reference
# data files will be stored in a subfolder refdata_tmp.

from refdata.config import ENV_BASEDIR

os.environ[ENV_BASEDIR] = './refdata_tmp'
[2]:
# Download the street abbreviation reference dataset ('usps:street_abbrev').
# Note that `refdata` below is bound to the module openclean.data.refdata,
# which is different from the top-level `refdata` package that ENV_BASEDIR
# was imported from when setting up the environment.

import openclean.data.refdata as refdata

refdata.download('usps:street_abbrev')
[3]:
# Use streaming function to avoid having to load the full dataset
# into memory.
# NOTE(review): despite its name, `df` holds whatever `stream()` returns,
# presumably a lazily evaluated data stream rather than an in-memory data
# frame - confirm against the openclean documentation.

from openclean.pipeline import stream

df = stream(datafile)
[4]:
# Collect the distinct values in the 'Street' column. Working on the set of
# distinct street names means that every name only has to be handled once
# in the following steps, no matter how often it occurs in the dataset.

streets = df.select('Street').distinct()

print(f'{len(streets)} distinct street names')
115567 distinct street names
[5]:
# Create a token signature from the street abbreviations.

from openclean.operator.map.groupby import groupby
from openclean.operator.transform.apply import apply
from openclean.profiling.pattern.token_signature import token_signature

# Load the reference data and convert the values in all columns to lower case.
street_abbrev = refdata.load('usps:street_abbrev').df()
street_abbrev = apply(street_abbrev, columns=street_abbrev.columns, func=str.lower)
# Group the rows by primary suffix to create one signature entry for each
# unique primary suffix.
groups = groupby(street_abbrev, columns='primary_suffix')
signature = token_signature(groups, columns=list(street_abbrev.columns))
[6]:
# Display the token signature (shown as the result of this cell).
signature
[6]:
[{'shl', 'shoal'},
 {'tunel', 'tunl', 'tunls', 'tunnel', 'tunnels', 'tunnl'},
 {'rest', 'rst'},
 {'ports', 'prts'},
 {'extensions', 'exts'},
 {'run'},
 {'clfs', 'cliffs'},
 {'est', 'estate'},
 {'pine', 'pne'},
 {'courts', 'cts'},
 {'cen', 'cent', 'center', 'centr', 'centre', 'cnter', 'cntr', 'ctr'},
 {'heights', 'ht', 'hts'},
 {'lf', 'loaf'},
 {'vill', 'villag', 'village', 'villg', 'villiage', 'vlg'},
 {'ter', 'terr', 'terrace'},
 {'vis', 'vist', 'vista', 'vst', 'vsta'},
 {'motorway', 'mtwy'},
 {'view', 'vw'},
 {'keys', 'kys'},
 {'key', 'ky'},
 {'shoar', 'shore', 'shr'},
 {'crest', 'crst'},
 {'plaza', 'plz', 'plza'},
 {'creek', 'crk'},
 {'hill', 'hl'},
 {'stra', 'strav', 'straven', 'stravenue', 'stravn', 'strvn', 'strvnue'},
 {'anex', 'annex', 'annx', 'anx'},
 {'rdg', 'rdge', 'ridge'},
 {'manors', 'mnrs'},
 {'drives', 'drs'},
 {'jctns', 'jcts', 'junctions'},
 {'forks', 'frks'},
 {'villages', 'vlgs'},
 {'lake', 'lk'},
 {'brks', 'brooks'},
 {'knls', 'knolls'},
 {'smt', 'sumit', 'sumitt', 'summit'},
 {'brdge', 'brg', 'bridge'},
 {'hills', 'hls'},
 {'riv', 'river', 'rivr', 'rvr'},
 {'av', 'ave', 'aven', 'avenu', 'avenue', 'avn', 'avnue'},
 {'ranch', 'ranches', 'rnch', 'rnchs'},
 {'rapid', 'rpd'},
 {'vdct', 'via', 'viadct', 'viaduct'},
 {'course', 'crse'},
 {'curv', 'curve'},
 {'grov', 'grove', 'grv'},
 {'points', 'pts'},
 {'cir', 'circ', 'circl', 'circle', 'crcl', 'crcle'},
 {'crossing', 'crssng', 'xing'},
 {'blf', 'bluf', 'bluff'},
 {'blfs', 'bluffs'},
 {'mnt', 'mount', 'mt'},
 {'rad', 'radial', 'radiel', 'radl'},
 {'field', 'fld'},
 {'shoars', 'shores', 'shrs'},
 {'parkway', 'parkwy', 'pkway', 'pkwy', 'pky'},
 {'valley', 'vally', 'vlly', 'vly'},
 {'forg', 'forge', 'frg'},
 {'camp', 'cmp', 'cp'},
 {'loop', 'loops'},
 {'exp', 'expr', 'express', 'expressway', 'expw', 'expy'},
 {'is', 'island', 'islnd'},
 {'track', 'tracks', 'trak', 'trk', 'trks'},
 {'unions', 'uns'},
 {'ldg', 'ldge', 'lodg', 'lodge'},
 {'manor', 'mnr'},
 {'forges', 'frgs'},
 {'haven', 'hvn'},
 {'mntain', 'mntn', 'mountain', 'mountin', 'mtin', 'mtn'},
 {'blvd', 'boul', 'boulevard', 'boulv'},
 {'lane', 'ln'},
 {'bend', 'bnd'},
 {'mdw', 'mdws', 'meadows', 'medows'},
 {'wells', 'wls'},
 {'underpass', 'upas'},
 {'rue'},
 {'coves', 'cvs'},
 {'cres', 'crescent', 'crsent', 'crsnt'},
 {'trafficway', 'trfy'},
 {'valleys', 'vlys'},
 {'row'},
 {'mews'},
 {'clf', 'cliff'},
 {'rds', 'roads'},
 {'stream', 'streme', 'strm'},
 {'groves', 'grvs'},
 {'mill', 'ml'},
 {'crossroad', 'xrd'},
 {'corners', 'cors'},
 {'lakes', 'lks'},
 {'nck', 'neck'},
 {'streets', 'sts'},
 {'wall'},
 {'orch', 'orchard', 'orchrd'},
 {'knl', 'knol', 'knoll'},
 {'spur'},
 {'sq', 'sqr', 'sqre', 'squ', 'square'},
 {'allee', 'alley', 'ally', 'aly'},
 {'glens', 'glns'},
 {'ville', 'vl'},
 {'st', 'str', 'street', 'strt'},
 {'bgs', 'burgs'},
 {'court', 'ct'},
 {'trailer', 'trlr', 'trlrs'},
 {'mall'},
 {'fall'},
 {'mntns', 'mountains', 'mtns'},
 {'fort', 'frt', 'ft'},
 {'cape', 'cpe'},
 {'ext', 'extension', 'extn', 'extnsn'},
 {'lgt', 'light'},
 {'well', 'wl'},
 {'harb', 'harbor', 'harbr', 'hbr', 'hrbor'},
 {'hllw', 'hollow', 'hollows', 'holw', 'holws'},
 {'centers', 'ctrs'},
 {'bch', 'beach'},
 {'pr', 'prairie', 'prr'},
 {'cor', 'corner'},
 {'un', 'union'},
 {'pass'},
 {'dr', 'driv', 'drive', 'drv'},
 {'point', 'pt'},
 {'pike', 'pikes'},
 {'walk', 'walks'},
 {'brk', 'brook'},
 {'mdw', 'meadow'},
 {'causeway', 'causwa', 'cswy'},
 {'sta', 'station', 'statn', 'stn'},
 {'lck', 'lock'},
 {'canyn', 'canyon', 'cnyn', 'cyn'},
 {'div', 'divide', 'dv', 'dvd'},
 {'port', 'prt'},
 {'harbors', 'hbrs'},
 {'forest', 'forests', 'frst'},
 {'views', 'vws'},
 {'jct', 'jction', 'jctn', 'junction', 'junctn', 'juncton'},
 {'park', 'prk'},
 {'freeway', 'freewy', 'frway', 'frwy', 'fwy'},
 {'dam', 'dm'},
 {'cmn', 'common'},
 {'parkways', 'pkwy', 'pkwys'},
 {'byp', 'bypa', 'bypas', 'bypass', 'byps'},
 {'fork', 'frk'},
 {'trail', 'trails', 'trl', 'trls'},
 {'skwy', 'skyway'},
 {'bayoo', 'bayou', 'byu'},
 {'arc', 'arcade'},
 {'cove', 'cv'},
 {'shls', 'shoals'},
 {'passage', 'psge'},
 {'throughway', 'trwy'},
 {'oval', 'ovl'},
 {'plain', 'pln'},
 {'walk'},
 {'land'},
 {'flats', 'flts'},
 {'estates', 'ests'},
 {'pl', 'place'},
 {'bg', 'burg'},
 {'highway', 'highwy', 'hiway', 'hiwy', 'hway', 'hwy'},
 {'rd', 'road'},
 {'plains', 'plns'},
 {'trace', 'traces', 'trce'},
 {'park', 'parks'},
 {'glen', 'gln'},
 {'green', 'grn'},
 {'inlet', 'inlt'},
 {'spur', 'spurs'},
 {'ferry', 'frry', 'fry'},
 {'pines', 'pnes'},
 {'garden', 'gardn', 'gdn', 'grden', 'grdn'},
 {'clb', 'club'},
 {'islands', 'islnds', 'iss'},
 {'route', 'rte'},
 {'rdgs', 'ridges'},
 {'falls', 'fls'},
 {'spg', 'spng', 'spring', 'sprng'},
 {'gardens', 'gdns', 'grdns'},
 {'isle', 'isles'},
 {'landing', 'lndg', 'lndng'},
 {'cmns', 'commons'},
 {'spgs', 'spngs', 'springs', 'sprngs'},
 {'mission', 'missn', 'msn', 'mssn'},
 {'circles', 'cirs'},
 {'fields', 'flds'},
 {'ramp'},
 {'lcks', 'locks'},
 {'path', 'paths'},
 {'crossroads', 'xrds'},
 {'bot', 'bottm', 'bottom', 'btm'},
 {'way', 'wy'},
 {'greens', 'grns'},
 {'ways'},
 {'fords', 'frds'},
 {'br', 'branch', 'brnch'},
 {'dale', 'dl'},
 {'lgts', 'lights'},
 {'mills', 'mls'},
 {'flat', 'flt'},
 {'sqrs', 'sqs', 'squares'},
 {'rapids', 'rpds'},
 {'opas', 'overpass'},
 {'gateway', 'gatewy', 'gatway', 'gtway', 'gtwy'},
 {'tpke', 'trnpk', 'turnpike', 'turnpk'},
 {'ford', 'frd'}]
[7]:
# Find the outliers among the distinct street names, i.e., the names that
# do not match at least one entry in the token signature.
# NOTE(review): the signature tokens were converted to lower case while the
# street names in the dataset appear to be upper case; presumably
# TokenSignatureOutliers normalizes case when matching - confirm.

from openclean.profiling.anomalies.pattern import TokenSignatureOutliers

outliers = TokenSignatureOutliers(signature=signature).process(streets)

print(f'found {len(outliers)} outliers in list of {len(streets)} street names')
found 27250 outliers in list of 115567 street names
[8]:
# Print a sorted random selection of 100 values from the discovered
# outliers. The generator is seeded to make the selection reproducible.
# Note that choices() draws with replacement, i.e., the same value may be
# printed more than once.

from random import Random

rng = Random(41)
sampled = rng.choices(outliers, k=100)
sampled.sort()
for val in sampled:
    print(val)
100  C/N OF SCHERMER
100FT PARKING LOT OF
137 STRET
15 FEET EAST OF 5TH
2  S/W C/O DAHILL
37 AVENUE+
53STREET
ATLANTIC AVE3
ATLANTICA
B 121
BENTON
BLDG 72 JFK AIRPORT
BRIGHTON 14
BRONX P EAST
C/O 64 CIRLCE
C/O B 60
C/O E 167 T
C/O E 21
C/O HURON
C/O LENEVAR
C/O W 127
C/O W 188ST
C/O WYCOFF
CNETRAL PK WEST
COLUMBUS NYCH
COMMONWEALTH BL
E 108
E 174ST
E/S 32 P
FIB
FLUSHING MUPPY LOT
FORSYTH STQ
GROTE
HANGAR 1 - 16 A
HERRING
HICKS
I/O CARROLL
INSIDE 188 LINCOLN A
INSIDE CONNISNSHON P
INSIDE FMCP INSIDE
IO E 135
LYDIG
MAC KAY
MANHATTAN AVW
MMPW
N 6
N/B FRANCIS LEWIS BL
N/E C/O E 88
N/E C/O FORSYTHE
N/E C/O YORK
N/E CRNR BAYRIDGE PK
N/S C/R OF MILFORD S
N/W C/O E 35
N/W W 239 STRET
N/W/C JAMAICA
NORDSTRAN
ODELL
OPP 104
P K LOT 566 HAMILTON
PARKING LOT #7
PIKTIN
Q 83 BUS OF
Q4 BUS STOP S/S ARCH
R/O 1455 HARROD
R/O 245-10 FRANCIS L
R/O 2946
R/SIDE BUILDING OF 3
REAR OF 131
REAR OF 48 MONUMENT
REAR OF EVANS
REV JMS PLT
S/B WEBSTER 450  FRO
S/E ARCHER 50FT
S/E C/O RIVINGTON
S/E GOUVERNEUR
S/O 81
S/S HAWKIN ATTORNEY
S/S SKILLMAN 2
S/W C/O 124 ST6
SIDE OF 456 E 3 STRE
ST.GEORGE LOT #1
T 5 ARRIVALS JFK
TERMINAL B
TERMINAL B-LGA
TOMPKINS3
VASE
W 170 TH
W 22ST
W 41 S
W 87
W 90TH
W/O DAVISON SOUTH
W/S 2 AVEENUE
W/S EDGECOMBE AEV
WATERS EDGE D
WATT
WAVE
WEST 39TH
WEST 44TH
WILLLIAMSBRIDGE