Functional Dependency Violations
Example showing how to detect functional dependency violations. Uses the NYC Parking Violations Issued - Fiscal Year 2014 dataset to identify violations of the functional dependency Meter Number -> Registration State, Street
.
[1]:
# Download the full 'DOB Job Application Fiings' dataset.
# Note that the fill is over 300MB in size.
import gzip
import os
from openclean.data.source.socrata import Socrata
datafile = './jt7v-77mi.tsv.gz'
# Download file only if it does not exist already.
if not os.path.isfile(datafile):
with gzip.open(datafile, 'wb') as f:
ds = Socrata().dataset('jt7v-77mi')
print('Downloading ...\n')
print(ds.name + '\n')
print(ds.description)
ds.write(f)
# As an alternative, you can also use the smaller dataset sample that is
# included in the repository.
#
# datafile = './data/jt7v-77mi.tsv.gz'
[2]:
# Verify that the download was successful. Print dataset columns and number of rows.
# This example makes use of the streaming option to avoid loading the full data frame
# into memory.
from openclean.pipeline import stream
ds = stream(datafile)
print('Schema\n------')
for col in ds.columns:
print(" '{}'".format(col))
print('\n{} rows.'.format(ds.count()))
Schema
------
'Summons Number'
'Plate ID'
'Registration State'
'Plate Type'
'Issue Date'
'Violation Code'
'Vehicle Body Type'
'Vehicle Make'
'Issuing Agency'
'Street Code1'
'Street Code2'
'Street Code3'
'Vehicle Expiration Date'
'Violation Location'
'Violation Precinct'
'Issuer Precinct'
'Issuer Code'
'Issuer Command'
'Issuer Squad'
'Violation Time'
'Time First Observed'
'Violation County'
'Violation In Front Of Or Opposite'
'Number'
'Street'
'Intersecting Street'
'Date First Observed'
'Law Section'
'Sub Division'
'Violation Legal Code'
'Days Parking In Effect '
'From Hours In Effect'
'To Hours In Effect'
'Vehicle Color'
'Unregistered Vehicle?'
'Vehicle Year'
'Meter Number'
'Feet From Curb'
'Violation Post Code'
'Violation Description'
'No Standing or Stopping Violation'
'Hydrant Violation'
'Double Parking Violation'
9100278 rows.
[3]:
# Profile a sample of 1000 rows using the default profiler.
profiles = ds.sample(n=1000, random_state=42).profile()
profiles
[3]:
[{'column': 'Summons Number',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 0,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'int': 1000})}),
'minmaxValues': {'int': {'minimum': 1286655493, 'maximum': 8004852361}}}},
{'column': 'Plate ID',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 0,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'str': 982, 'int': 18})}),
'minmaxValues': {'str': {'minimum': '019KWM', 'maximum': 'ZFF73E'},
'int': {'minimum': 22, 'maximum': 8184138}}}},
{'column': 'Registration State',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 0,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'str': 998, 'int': 2})}),
'minmaxValues': {'str': {'minimum': 'AZ', 'maximum': 'VT'},
'int': {'minimum': 99, 'maximum': 99}}}},
{'column': 'Plate Type',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 0,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'str': 994, 'int': 6})}),
'minmaxValues': {'str': {'minimum': 'APP', 'maximum': 'TRC'},
'int': {'minimum': 999, 'maximum': 999}}}},
{'column': 'Issue Date',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 0,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'date': 1000})}),
'minmaxValues': {'date': {'minimum': datetime.datetime(2012, 12, 10, 0, 0),
'maximum': datetime.datetime(2014, 6, 25, 0, 0)}}}},
{'column': 'Violation Code',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 0,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'int': 1000})}),
'minmaxValues': {'int': {'minimum': 5, 'maximum': 99}}}},
{'column': 'Vehicle Body Type',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 7,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'str': 993})}),
'minmaxValues': {'str': {'minimum': '2 DR', 'maximum': 'VAN'}}}},
{'column': 'Vehicle Make',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 9,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'str': 991})}),
'minmaxValues': {'str': {'minimum': 'ACURA', 'maximum': 'WORKH'}}}},
{'column': 'Issuing Agency',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 0,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'str': 1000})}),
'minmaxValues': {'str': {'minimum': 'P', 'maximum': 'X'}}}},
{'column': 'Street Code1',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 0,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'int': 1000})}),
'minmaxValues': {'int': {'minimum': 0, 'maximum': 95030}}}},
{'column': 'Street Code2',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 0,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'int': 1000})}),
'minmaxValues': {'int': {'minimum': 0, 'maximum': 97740}}}},
{'column': 'Street Code3',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 0,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'int': 1000})}),
'minmaxValues': {'int': {'minimum': 0, 'maximum': 98260}}}},
{'column': 'Vehicle Expiration Date',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 0,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'int': 1000})}),
'minmaxValues': {'int': {'minimum': 0, 'maximum': 88888888}}}},
{'column': 'Violation Location',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 72,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'int': 928})}),
'minmaxValues': {'int': {'minimum': 1, 'maximum': 122}}}},
{'column': 'Violation Precinct',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 0,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'int': 1000})}),
'minmaxValues': {'int': {'minimum': 0, 'maximum': 122}}}},
{'column': 'Issuer Precinct',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 0,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'int': 1000})}),
'minmaxValues': {'int': {'minimum': 0, 'maximum': 977}}}},
{'column': 'Issuer Code',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 0,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'int': 1000})}),
'minmaxValues': {'int': {'minimum': 0, 'maximum': 981230}}}},
{'column': 'Issuer Command',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 69,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'str': 797, 'int': 134})}),
'minmaxValues': {'str': {'minimum': '0NIC', 'maximum': 'T803'},
'int': {'minimum': 0, 'maximum': 977}}}},
{'column': 'Issuer Squad',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 69,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'str': 779, 'int': 152})}),
'minmaxValues': {'str': {'minimum': 'A', 'maximum': 'YA'},
'int': {'minimum': 0, 'maximum': 0}}}},
{'column': 'Violation Time',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 1,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'str': 999})}),
'minmaxValues': {'str': {'minimum': '0014A', 'maximum': '1259P'}}}},
{'column': 'Time First Observed',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 896,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'str': 104})}),
'minmaxValues': {'str': {'minimum': '0107P', 'maximum': '1257P'}}}},
{'column': 'Violation County',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 72,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'str': 928})}),
'minmaxValues': {'str': {'minimum': 'BX', 'maximum': 'R'}}}},
{'column': 'Violation In Front Of Or Opposite',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 87,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'str': 913})}),
'minmaxValues': {'str': {'minimum': 'F', 'maximum': 'X'}}}},
{'column': 'Number',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 98,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'str': 279, 'int': 623})}),
'minmaxValues': {'str': {'minimum': '10-16', 'maximum': 'W'},
'int': {'minimum': 1, 'maximum': 16305}}}},
{'column': 'Street',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 1,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'str': 999})}),
'minmaxValues': {'str': {'minimum': '101st St', 'maximum': 'Wythe Ave'}}}},
{'column': 'Intersecting Street',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 798,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'str': 202})}),
'minmaxValues': {'str': {'minimum': ')@ALLERTON AVE-XA-2',
'maximum': 'Y SVC RD @ 72ND RD'}}}},
{'column': 'Date First Observed',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 0,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'int': 1000})}),
'minmaxValues': {'int': {'minimum': 0, 'maximum': 20140618}}}},
{'column': 'Law Section',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 0,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'int': 1000})}),
'minmaxValues': {'int': {'minimum': 408, 'maximum': 1180}}}},
{'column': 'Sub Division',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 0,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'str': 998, 'int': 2})}),
'minmaxValues': {'str': {'minimum': 'B', 'maximum': 'm5'},
'int': {'minimum': 5, 'maximum': 99}}}},
{'column': 'Violation Legal Code',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 930,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'str': 69, 'int': 1})}),
'minmaxValues': {'str': {'minimum': 'T', 'maximum': 'T'},
'int': {'minimum': 0, 'maximum': 0}}}},
{'column': 'Days Parking In Effect ',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 155,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'str': 845})}),
'minmaxValues': {'str': {'minimum': 'BBBBBBB', 'maximum': 'YYYYYYY'}}}},
{'column': 'From Hours In Effect',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 366,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'str': 634})}),
'minmaxValues': {'str': {'minimum': '0 :', 'maximum': 'ALL'}}}},
{'column': 'To Hours In Effect',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 366,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'str': 634})}),
'minmaxValues': {'str': {'minimum': '0 :', 'maximum': 'ALL'}}}},
{'column': 'Vehicle Color',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 11,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'str': 989})}),
'minmaxValues': {'str': {'minimum': 'BK', 'maximum': 'YW'}}}},
{'column': 'Unregistered Vehicle?',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 848,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'int': 152})}),
'minmaxValues': {'int': {'minimum': 0, 'maximum': 0}}}},
{'column': 'Vehicle Year',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 0,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'int': 1000})}),
'minmaxValues': {'int': {'minimum': 0, 'maximum': 2015}}}},
{'column': 'Meter Number',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 751,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'str': 249})}),
'minmaxValues': {'str': {'minimum': '-', 'maximum': '495-0067'}}}},
{'column': 'Feet From Curb',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 0,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'int': 1000})}),
'minmaxValues': {'int': {'minimum': 0, 'maximum': 10}}}},
{'column': 'Violation Post Code',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 220,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'str': 743, 'int': 37})}),
'minmaxValues': {'str': {'minimum': '01 -', 'maximum': 'Y 99'},
'int': {'minimum': 0, 'maximum': 131}}}},
{'column': 'Violation Description',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 152,
'datatypes': defaultdict(collections.Counter,
{'total': Counter({'str': 848})}),
'minmaxValues': {'str': {'minimum': '09-Blocking the Box',
'maximum': 'PHTO SCHOOL ZN SPEED VIOLATION'}}}},
{'column': 'No Standing or Stopping Violation',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 1000,
'datatypes': defaultdict(collections.Counter, {}),
'minmaxValues': {}}},
{'column': 'Hydrant Violation',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 1000,
'datatypes': defaultdict(collections.Counter, {}),
'minmaxValues': {}}},
{'column': 'Double Parking Violation',
'stats': {'totalValueCount': 1000,
'emptyValueCount': 1000,
'datatypes': defaultdict(collections.Counter, {}),
'minmaxValues': {}}}]
[4]:
# Print number of empty cells for each column
profiles.stats()['empty']
[4]:
Summons Number 0
Plate ID 0
Registration State 0
Plate Type 0
Issue Date 0
Violation Code 0
Vehicle Body Type 7
Vehicle Make 9
Issuing Agency 0
Street Code1 0
Street Code2 0
Street Code3 0
Vehicle Expiration Date 0
Violation Location 72
Violation Precinct 0
Issuer Precinct 0
Issuer Code 0
Issuer Command 69
Issuer Squad 69
Violation Time 1
Time First Observed 896
Violation County 72
Violation In Front Of Or Opposite 87
Number 98
Street 1
Intersecting Street 798
Date First Observed 0
Law Section 0
Sub Division 0
Violation Legal Code 930
Days Parking In Effect 155
From Hours In Effect 366
To Hours In Effect 366
Vehicle Color 11
Unregistered Vehicle? 848
Vehicle Year 0
Meter Number 751
Feet From Curb 0
Violation Post Code 220
Violation Description 152
No Standing or Stopping Violation 1000
Hydrant Violation 1000
Double Parking Violation 1000
Name: empty, dtype: int64
[5]:
# Show minimum and maximum value for column 'Meter Number'. We see
# that the column not only contains a lot of emoty values but also
# '-' as an alternative representation for a missing value.
profiles.minmax('Meter Number')
[5]:
min | max | |
---|---|---|
str | - | 495-0067 |
[6]:
# Get the first 1000 rows. Ignore rows where the meter number is undefined (i.e., either
# an empty string or '-'). Convert the result into a data frame.
from openclean.function.eval.domain import IsNotIn
# We only select a subset of columns for this demo.
columns = [
'Plate ID',
'Registration State',
'Plate Type',
'Meter Number',
'Street',
'Vehicle Body Type',
'Vehicle Make',
'Vehicle Color'
]
df = ds\
.select(columns)\
.where(IsNotIn('Meter Number', set({'-', ''})), limit=1000)\
.to_df()
df.head()
[6]:
Plate ID | Registration State | Plate Type | Meter Number | Street | Vehicle Body Type | Vehicle Make | Vehicle Color | |
---|---|---|---|---|---|---|---|---|
661 | FXY1858 | NY | PAS | 407-3018 | QUEENS BLVD | SDN | NISSA | GY |
780 | 89988JX | NY | COM | 3 - | FRESH POND TRD | VAN | FORD | WHITE |
901 | FGX2747 | NY | PAS | 504-3043 | SDN | HONDA | SILVE | |
2287 | 23161JR | NY | COM | 144-3942 | WEST 42 STREET | P-U | FORD | WHITE |
2346 | 47153MC | NY | COM | 144-3987 | W 40TH ST | SDN | TOYOT | SILV |
[7]:
# Find violations of the functional dependency Meter Number -> Street.
from openclean.operator.map.violations import fd_violations
groups = fd_violations(df, lhs='Meter Number', rhs='Street')
[8]:
# List meter numbers that have violations and the number of
# violating values.
for key in groups:
print('{} {}'.format(key, groups.get(key).shape[0]))
144-3942 4
143-5293 4
144-6383 9
144-3937 9
143-3785 12
144-6376 5
144-6001 3
144-6089 3
143-3791 3
114-9979 2
144-3958 4
144-6453 3
144-3955 8
144-5988 2
144-6088 6
143-3786 2
144-6377 5
143-5983 6
140-5816 3
105-8347 2
140-5756 2
143-3793 2
117-5778 3
144-6047 2
140-6386 4
144-3957 2
143-3787 3
144-3959 2
140-5716 2
143-3767 2
140-9944 3
114-9970 2
105-8390 2
119-4780 3
120-8052 4
431-3003 2
143-5694 4
144-6601 2
301-3609 3
105-8346 2
140-9954 2
145-7412 2
103-4489 2
301-3678 2
143-3901 2
407-2167 2
201-3042 3
407-2206 2
201-3012 2
226-2760 2
[9]:
# Show street names that cause violations of the functional dependency.
from openclean.operator.collector.count import distinct
print('Meter Number | Street (Count)')
print('=============|===============')
for key in groups:
conflicts = distinct(groups.get(key), 'Street').most_common()
street, count = conflicts[0]
print('{:<12} | {} x {}'.format(key, count, street))
for street, count in conflicts[1:]:
print(' | {} x {}'.format(count, street))
print('-------------|---------------')
Meter Number | Street (Count)
=============|===============
144-3942 | 1 x WEST 42 STREET
| 1 x WEST 42 ST
| 1 x WEST 42ND STREET
| 1 x W 42 STREET
-------------|---------------
143-5293 | 2 x W 45 ST
| 1 x Columbus Ave
| 1 x W 45th St
-------------|---------------
144-6383 | 8 x 9TH AVE
| 1 x 9TH AVENUE
-------------|---------------
144-3937 | 5 x WEST 42 STREET
| 1 x WEST 42 ST
| 1 x W 42ND ST
| 1 x WEST 42ND
| 1 x W 42 ST
-------------|---------------
143-3785 | 3 x WEST 43RD ST
| 3 x W 43 ST
| 2 x WEST 43 ST
| 2 x W 43RD ST
| 1 x WEST 43RD STREET
| 1 x W 43 CARTER HOTEL
-------------|---------------
144-6376 | 3 x 8TH AVENUE
| 2 x 8TH AVE
-------------|---------------
144-6001 | 1 x WEST 38 ST
| 1 x W 38 ST
| 1 x W 38th St
-------------|---------------
144-6089 | 1 x S/W/C/O W 36 ST
| 1 x 8TH AVENUE
| 1 x S/W C/O W 36 ST
-------------|---------------
143-3791 | 2 x WEST 46 ST
| 1 x W 46 ST
-------------|---------------
114-9979 | 1 x GROVE ST
| 1 x GROVE STREET
-------------|---------------
144-3958 | 2 x WEST 41ST STREET
| 1 x 7TH AVENUE
| 1 x WEST 41 ST
-------------|---------------
144-6453 | 2 x BROADWAY
| 1 x C/O 41ST STREET
-------------|---------------
144-3955 | 4 x W 41 ST
| 3 x W 41ST STREET
| 1 x TIMES SQUARE
-------------|---------------
144-5988 | 1 x W 38
| 1 x W 38TH ST
-------------|---------------
144-6088 | 4 x WEST 36 STREET
| 2 x W 36TH ST
-------------|---------------
143-3786 | 1 x WEST 44TH STREET
| 1 x WEST 44 STREET
-------------|---------------
144-6377 | 2 x 8TH AVENUE
| 1 x 35 ST
| 1 x W 35 ST
| 1 x 8TH AVE
-------------|---------------
143-5983 | 4 x WEST 43RD STREET
| 2 x W 43RD STREET
-------------|---------------
140-5816 | 1 x 8TH AVE
| 1 x WEST 45TH STREET
| 1 x WEST 45 ST
-------------|---------------
105-8347 | 1 x WEST 181 ST
| 1 x W 181 ST
-------------|---------------
140-5756 | 1 x WEST 50TH STREET
| 1 x W 50th St
-------------|---------------
143-3793 | 1 x WEST 47TH ST
| 1 x W 47 STREET
-------------|---------------
117-5778 | 2 x CENTRAL PARK SOUTH
| 1 x Central Park South
-------------|---------------
144-6047 | 1 x WEST 37TH ST
| 1 x WEST 37
-------------|---------------
140-6386 | 2 x 8TH AVE
| 2 x 8TH AVENUE
-------------|---------------
144-3957 | 1 x WEST 41ST STREET
| 1 x WEST 41 STREET
-------------|---------------
143-3787 | 2 x WEST 44 STREET
| 1 x W 44TH STREET
-------------|---------------
144-3959 | 1 x S/S CO. 41 ST
| 1 x WEST 41 ST
-------------|---------------
140-5716 | 1 x C/O 41 ST
| 1 x W 50th St
-------------|---------------
143-3767 | 1 x S/S E 52 ST
| 1 x Madison Ave
-------------|---------------
140-9944 | 2 x 8TH AVENUE
| 1 x 8TH AVE
-------------|---------------
114-9970 | 1 x W 181ST ST
| 1 x CHRISTOPHER ST
-------------|---------------
105-8390 | 1 x BROADWAY
| 1 x Broadway
-------------|---------------
119-4780 | 2 x Lexington Ave
| 1 x LEXINGTON AVE
-------------|---------------
120-8052 | 2 x E 106 ST
| 2 x E 106 STREET
-------------|---------------
431-3003 | 1 x Steinway St
| 1 x Jamaica Ave
-------------|---------------
143-5694 | 3 x W 44TH ST
| 1 x W 44 ST
-------------|---------------
144-6601 | 1 x WEST 32 STREET
| 1 x W 32 ST
-------------|---------------
301-3609 | 2 x SCHERMERHORN STREET
| 1 x SCHERMERHORN ST
-------------|---------------
105-8346 | 1 x C/O W 181
| 1 x W 181ST ST
-------------|---------------
140-9954 | 1 x W 152 ST
| 1 x W 52nd St
-------------|---------------
145-7412 | 1 x E 19th St
| 1 x E 18th St
-------------|---------------
103-4489 | 1 x Columbus Ave
| 1 x 8th Ave
-------------|---------------
301-3678 | 1 x State St
| 1 x Front St
-------------|---------------
143-3901 | 1 x Madison Ave
| 1 x E 59th St
-------------|---------------
407-2167 | 1 x 38th St
| 1 x 35th Ave
-------------|---------------
201-3042 | 1 x Tiebout Ave
| 1 x Jerome Ave
| 1 x W 183rd St
-------------|---------------
407-2206 | 1 x 31st Ave
| 1 x 37th St
-------------|---------------
201-3012 | 1 x Jerome Ave
| 1 x Grand Ave
-------------|---------------
226-2760 | 1 x E 170th St
| 1 x Broadway
-------------|---------------
[10]:
# The Plate ID and Registration State should identify a vehicle uniquely. Here
# we focus on the vehicle color.
#
# Find violations of the FD ['Plate ID', 'Registration State'] -> ['Vehicle Color']
groups = fd_violations(df, lhs=['Plate ID', 'Registration State'], rhs='Vehicle Color')
[11]:
# List Plate ID and Registration State for vehicles that have multiple colors in the dataset
# together with the set of different colors.
from collections import Counter
for key in groups:
conflicts = Counter(groups.get(key)['Vehicle Color'].value_counts().keys()).most_common()
c_format = ', '.join(['{}: {}'.format(val, cnt) for val, cnt in conflicts])
print('{} = [{}]'.format(key, c_format))
('63272JM', 'NY') = [BROWN: 1, BWN: 1]
('99308MC', 'NY') = [WHITE: 1, WH: 1]
('DNB3070', 'NY') = [BLK: 1, BLACK: 1]
('95743JM', 'NY') = [BL: 1, BLUE: 1]
('63677JM', 'NY') = [BRN: 1, BROWN: 1]
('87071JS', 'NY') = [WH: 1, WHITE: 1]