Functional Dependency Violations

Example showing how to detect functional dependency violations. Uses the NYC Parking Violations Issued - Fiscal Year 2014 dataset to identify violations of the functional dependency Meter Number -> Registration State, Street.

[1]:
# Download the full 'DOB Job Application Fiings' dataset.
# Note that the fill is over 300MB in size.

import gzip
import os

from openclean.data.source.socrata import Socrata

datafile = './jt7v-77mi.tsv.gz'

# Download file only if it does not exist already.
if not os.path.isfile(datafile):
    with gzip.open(datafile, 'wb') as f:
        ds = Socrata().dataset('jt7v-77mi')
        print('Downloading ...\n')
        print(ds.name + '\n')
        print(ds.description)
        ds.write(f)


# As an alternative, you can also use the smaller dataset sample that is
# included in the repository.
#
# datafile = './data/jt7v-77mi.tsv.gz'
[2]:
# Verify that the download was successful. Print dataset columns and number of rows.
# This example makes use of the streaming option to avoid loading the full data frame
# into memory.

from openclean.pipeline import stream

ds = stream(datafile)


print('Schema\n------')
for col in ds.columns:
    print("  '{}'".format(col))

print('\n{} rows.'.format(ds.count()))
Schema
------
  'Summons Number'
  'Plate ID'
  'Registration State'
  'Plate Type'
  'Issue Date'
  'Violation Code'
  'Vehicle Body Type'
  'Vehicle Make'
  'Issuing Agency'
  'Street Code1'
  'Street Code2'
  'Street Code3'
  'Vehicle Expiration Date'
  'Violation Location'
  'Violation Precinct'
  'Issuer Precinct'
  'Issuer Code'
  'Issuer Command'
  'Issuer Squad'
  'Violation Time'
  'Time First Observed'
  'Violation County'
  'Violation In Front Of Or Opposite'
  'Number'
  'Street'
  'Intersecting Street'
  'Date First Observed'
  'Law Section'
  'Sub Division'
  'Violation Legal Code'
  'Days Parking In Effect    '
  'From Hours In Effect'
  'To Hours In Effect'
  'Vehicle Color'
  'Unregistered Vehicle?'
  'Vehicle Year'
  'Meter Number'
  'Feet From Curb'
  'Violation Post Code'
  'Violation Description'
  'No Standing or Stopping Violation'
  'Hydrant Violation'
  'Double Parking Violation'

9100278 rows.
[3]:
# Profile a sample of 1000 rows using the default profiler.

profiles = ds.sample(n=1000, random_state=42).profile()
profiles
[3]:
[{'column': 'Summons Number',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 0,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'int': 1000})}),
   'minmaxValues': {'int': {'minimum': 1286655493, 'maximum': 8004852361}}}},
 {'column': 'Plate ID',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 0,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'str': 982, 'int': 18})}),
   'minmaxValues': {'str': {'minimum': '019KWM', 'maximum': 'ZFF73E'},
    'int': {'minimum': 22, 'maximum': 8184138}}}},
 {'column': 'Registration State',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 0,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'str': 998, 'int': 2})}),
   'minmaxValues': {'str': {'minimum': 'AZ', 'maximum': 'VT'},
    'int': {'minimum': 99, 'maximum': 99}}}},
 {'column': 'Plate Type',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 0,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'str': 994, 'int': 6})}),
   'minmaxValues': {'str': {'minimum': 'APP', 'maximum': 'TRC'},
    'int': {'minimum': 999, 'maximum': 999}}}},
 {'column': 'Issue Date',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 0,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'date': 1000})}),
   'minmaxValues': {'date': {'minimum': datetime.datetime(2012, 12, 10, 0, 0),
     'maximum': datetime.datetime(2014, 6, 25, 0, 0)}}}},
 {'column': 'Violation Code',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 0,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'int': 1000})}),
   'minmaxValues': {'int': {'minimum': 5, 'maximum': 99}}}},
 {'column': 'Vehicle Body Type',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 7,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'str': 993})}),
   'minmaxValues': {'str': {'minimum': '2 DR', 'maximum': 'VAN'}}}},
 {'column': 'Vehicle Make',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 9,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'str': 991})}),
   'minmaxValues': {'str': {'minimum': 'ACURA', 'maximum': 'WORKH'}}}},
 {'column': 'Issuing Agency',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 0,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'str': 1000})}),
   'minmaxValues': {'str': {'minimum': 'P', 'maximum': 'X'}}}},
 {'column': 'Street Code1',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 0,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'int': 1000})}),
   'minmaxValues': {'int': {'minimum': 0, 'maximum': 95030}}}},
 {'column': 'Street Code2',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 0,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'int': 1000})}),
   'minmaxValues': {'int': {'minimum': 0, 'maximum': 97740}}}},
 {'column': 'Street Code3',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 0,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'int': 1000})}),
   'minmaxValues': {'int': {'minimum': 0, 'maximum': 98260}}}},
 {'column': 'Vehicle Expiration Date',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 0,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'int': 1000})}),
   'minmaxValues': {'int': {'minimum': 0, 'maximum': 88888888}}}},
 {'column': 'Violation Location',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 72,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'int': 928})}),
   'minmaxValues': {'int': {'minimum': 1, 'maximum': 122}}}},
 {'column': 'Violation Precinct',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 0,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'int': 1000})}),
   'minmaxValues': {'int': {'minimum': 0, 'maximum': 122}}}},
 {'column': 'Issuer Precinct',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 0,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'int': 1000})}),
   'minmaxValues': {'int': {'minimum': 0, 'maximum': 977}}}},
 {'column': 'Issuer Code',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 0,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'int': 1000})}),
   'minmaxValues': {'int': {'minimum': 0, 'maximum': 981230}}}},
 {'column': 'Issuer Command',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 69,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'str': 797, 'int': 134})}),
   'minmaxValues': {'str': {'minimum': '0NIC', 'maximum': 'T803'},
    'int': {'minimum': 0, 'maximum': 977}}}},
 {'column': 'Issuer Squad',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 69,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'str': 779, 'int': 152})}),
   'minmaxValues': {'str': {'minimum': 'A', 'maximum': 'YA'},
    'int': {'minimum': 0, 'maximum': 0}}}},
 {'column': 'Violation Time',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 1,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'str': 999})}),
   'minmaxValues': {'str': {'minimum': '0014A', 'maximum': '1259P'}}}},
 {'column': 'Time First Observed',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 896,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'str': 104})}),
   'minmaxValues': {'str': {'minimum': '0107P', 'maximum': '1257P'}}}},
 {'column': 'Violation County',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 72,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'str': 928})}),
   'minmaxValues': {'str': {'minimum': 'BX', 'maximum': 'R'}}}},
 {'column': 'Violation In Front Of Or Opposite',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 87,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'str': 913})}),
   'minmaxValues': {'str': {'minimum': 'F', 'maximum': 'X'}}}},
 {'column': 'Number',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 98,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'str': 279, 'int': 623})}),
   'minmaxValues': {'str': {'minimum': '10-16', 'maximum': 'W'},
    'int': {'minimum': 1, 'maximum': 16305}}}},
 {'column': 'Street',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 1,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'str': 999})}),
   'minmaxValues': {'str': {'minimum': '101st St', 'maximum': 'Wythe Ave'}}}},
 {'column': 'Intersecting Street',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 798,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'str': 202})}),
   'minmaxValues': {'str': {'minimum': ')@ALLERTON AVE-XA-2',
     'maximum': 'Y SVC RD @ 72ND RD'}}}},
 {'column': 'Date First Observed',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 0,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'int': 1000})}),
   'minmaxValues': {'int': {'minimum': 0, 'maximum': 20140618}}}},
 {'column': 'Law Section',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 0,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'int': 1000})}),
   'minmaxValues': {'int': {'minimum': 408, 'maximum': 1180}}}},
 {'column': 'Sub Division',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 0,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'str': 998, 'int': 2})}),
   'minmaxValues': {'str': {'minimum': 'B', 'maximum': 'm5'},
    'int': {'minimum': 5, 'maximum': 99}}}},
 {'column': 'Violation Legal Code',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 930,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'str': 69, 'int': 1})}),
   'minmaxValues': {'str': {'minimum': 'T', 'maximum': 'T'},
    'int': {'minimum': 0, 'maximum': 0}}}},
 {'column': 'Days Parking In Effect    ',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 155,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'str': 845})}),
   'minmaxValues': {'str': {'minimum': 'BBBBBBB', 'maximum': 'YYYYYYY'}}}},
 {'column': 'From Hours In Effect',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 366,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'str': 634})}),
   'minmaxValues': {'str': {'minimum': '0  :', 'maximum': 'ALL'}}}},
 {'column': 'To Hours In Effect',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 366,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'str': 634})}),
   'minmaxValues': {'str': {'minimum': '0  :', 'maximum': 'ALL'}}}},
 {'column': 'Vehicle Color',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 11,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'str': 989})}),
   'minmaxValues': {'str': {'minimum': 'BK', 'maximum': 'YW'}}}},
 {'column': 'Unregistered Vehicle?',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 848,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'int': 152})}),
   'minmaxValues': {'int': {'minimum': 0, 'maximum': 0}}}},
 {'column': 'Vehicle Year',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 0,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'int': 1000})}),
   'minmaxValues': {'int': {'minimum': 0, 'maximum': 2015}}}},
 {'column': 'Meter Number',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 751,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'str': 249})}),
   'minmaxValues': {'str': {'minimum': '-', 'maximum': '495-0067'}}}},
 {'column': 'Feet From Curb',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 0,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'int': 1000})}),
   'minmaxValues': {'int': {'minimum': 0, 'maximum': 10}}}},
 {'column': 'Violation Post Code',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 220,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'str': 743, 'int': 37})}),
   'minmaxValues': {'str': {'minimum': '01 -', 'maximum': 'Y 99'},
    'int': {'minimum': 0, 'maximum': 131}}}},
 {'column': 'Violation Description',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 152,
   'datatypes': defaultdict(collections.Counter,
               {'total': Counter({'str': 848})}),
   'minmaxValues': {'str': {'minimum': '09-Blocking the Box',
     'maximum': 'PHTO SCHOOL ZN SPEED VIOLATION'}}}},
 {'column': 'No Standing or Stopping Violation',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 1000,
   'datatypes': defaultdict(collections.Counter, {}),
   'minmaxValues': {}}},
 {'column': 'Hydrant Violation',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 1000,
   'datatypes': defaultdict(collections.Counter, {}),
   'minmaxValues': {}}},
 {'column': 'Double Parking Violation',
  'stats': {'totalValueCount': 1000,
   'emptyValueCount': 1000,
   'datatypes': defaultdict(collections.Counter, {}),
   'minmaxValues': {}}}]
[4]:
# Print number of empty cells for each column

profiles.stats()['empty']
[4]:
Summons Number                          0
Plate ID                                0
Registration State                      0
Plate Type                              0
Issue Date                              0
Violation Code                          0
Vehicle Body Type                       7
Vehicle Make                            9
Issuing Agency                          0
Street Code1                            0
Street Code2                            0
Street Code3                            0
Vehicle Expiration Date                 0
Violation Location                     72
Violation Precinct                      0
Issuer Precinct                         0
Issuer Code                             0
Issuer Command                         69
Issuer Squad                           69
Violation Time                          1
Time First Observed                   896
Violation County                       72
Violation In Front Of Or Opposite      87
Number                                 98
Street                                  1
Intersecting Street                   798
Date First Observed                     0
Law Section                             0
Sub Division                            0
Violation Legal Code                  930
Days Parking In Effect                155
From Hours In Effect                  366
To Hours In Effect                    366
Vehicle Color                          11
Unregistered Vehicle?                 848
Vehicle Year                            0
Meter Number                          751
Feet From Curb                          0
Violation Post Code                   220
Violation Description                 152
No Standing or Stopping Violation    1000
Hydrant Violation                    1000
Double Parking Violation             1000
Name: empty, dtype: int64
[5]:
# Show minimum and maximum value for column 'Meter Number'. We see
# that the column not only contains a lot of emoty values but also
# '-' as an alternative representation for a missing value.

profiles.minmax('Meter Number')
[5]:
min max
str - 495-0067
[6]:
# Get the first 1000 rows. Ignore rows where the meter number is undefined (i.e., either
# an empty string or '-'). Convert the result into a data frame.

from openclean.function.eval.domain import IsNotIn

# We only select a subset of columns for this demo.
columns = [
    'Plate ID',
    'Registration State',
    'Plate Type',
    'Meter Number',
    'Street',
    'Vehicle Body Type',
    'Vehicle Make',
    'Vehicle Color'
]

df = ds\
    .select(columns)\
    .where(IsNotIn('Meter Number', set({'-', ''})), limit=1000)\
    .to_df()

df.head()
[6]:
Plate ID Registration State Plate Type Meter Number Street Vehicle Body Type Vehicle Make Vehicle Color
661 FXY1858 NY PAS 407-3018 QUEENS BLVD SDN NISSA GY
780 89988JX NY COM 3 - FRESH POND TRD VAN FORD WHITE
901 FGX2747 NY PAS 504-3043 SDN HONDA SILVE
2287 23161JR NY COM 144-3942 WEST 42 STREET P-U FORD WHITE
2346 47153MC NY COM 144-3987 W 40TH ST SDN TOYOT SILV
[7]:
# Find violations of the functional dependency Meter Number -> Street.

from openclean.operator.map.violations import fd_violations

groups = fd_violations(df, lhs='Meter Number', rhs='Street')
[8]:
# List meter numbers that have violations and the number of
# violating values.

for key in groups:
    print('{} {}'.format(key, groups.get(key).shape[0]))
144-3942 4
143-5293 4
144-6383 9
144-3937 9
143-3785 12
144-6376 5
144-6001 3
144-6089 3
143-3791 3
114-9979 2
144-3958 4
144-6453 3
144-3955 8
144-5988 2
144-6088 6
143-3786 2
144-6377 5
143-5983 6
140-5816 3
105-8347 2
140-5756 2
143-3793 2
117-5778 3
144-6047 2
140-6386 4
144-3957 2
143-3787 3
144-3959 2
140-5716 2
143-3767 2
140-9944 3
114-9970 2
105-8390 2
119-4780 3
120-8052 4
431-3003 2
143-5694 4
144-6601 2
301-3609 3
105-8346 2
140-9954 2
145-7412 2
103-4489 2
301-3678 2
143-3901 2
407-2167 2
201-3042 3
407-2206 2
201-3012 2
226-2760 2
[9]:
# Show street names that cause violations of the functional dependency.

from openclean.operator.collector.count import distinct

print('Meter Number | Street (Count)')
print('=============|===============')
for key in groups:
    conflicts = distinct(groups.get(key), 'Street').most_common()
    street, count = conflicts[0]
    print('{:<12} | {} x {}'.format(key, count, street))
    for street, count in conflicts[1:]:
        print('             | {} x {}'.format(count, street))
    print('-------------|---------------')
Meter Number | Street (Count)
=============|===============
144-3942     | 1 x WEST 42 STREET
             | 1 x WEST 42 ST
             | 1 x WEST 42ND STREET
             | 1 x W 42 STREET
-------------|---------------
143-5293     | 2 x W 45 ST
             | 1 x Columbus Ave
             | 1 x W 45th St
-------------|---------------
144-6383     | 8 x 9TH AVE
             | 1 x 9TH AVENUE
-------------|---------------
144-3937     | 5 x WEST 42 STREET
             | 1 x WEST 42 ST
             | 1 x W 42ND ST
             | 1 x WEST 42ND
             | 1 x W 42 ST
-------------|---------------
143-3785     | 3 x WEST 43RD ST
             | 3 x W 43 ST
             | 2 x WEST 43 ST
             | 2 x W 43RD ST
             | 1 x WEST 43RD STREET
             | 1 x W 43 CARTER HOTEL
-------------|---------------
144-6376     | 3 x 8TH AVENUE
             | 2 x 8TH AVE
-------------|---------------
144-6001     | 1 x WEST 38 ST
             | 1 x W 38 ST
             | 1 x W 38th St
-------------|---------------
144-6089     | 1 x S/W/C/O W 36 ST
             | 1 x 8TH AVENUE
             | 1 x S/W C/O W 36 ST
-------------|---------------
143-3791     | 2 x WEST 46 ST
             | 1 x W 46 ST
-------------|---------------
114-9979     | 1 x GROVE ST
             | 1 x GROVE STREET
-------------|---------------
144-3958     | 2 x WEST 41ST STREET
             | 1 x 7TH AVENUE
             | 1 x WEST 41 ST
-------------|---------------
144-6453     | 2 x BROADWAY
             | 1 x C/O 41ST STREET
-------------|---------------
144-3955     | 4 x W 41 ST
             | 3 x W 41ST STREET
             | 1 x TIMES SQUARE
-------------|---------------
144-5988     | 1 x W 38
             | 1 x W 38TH ST
-------------|---------------
144-6088     | 4 x WEST 36 STREET
             | 2 x W 36TH ST
-------------|---------------
143-3786     | 1 x WEST 44TH STREET
             | 1 x WEST 44 STREET
-------------|---------------
144-6377     | 2 x 8TH AVENUE
             | 1 x 35 ST
             | 1 x W 35 ST
             | 1 x 8TH AVE
-------------|---------------
143-5983     | 4 x WEST 43RD STREET
             | 2 x W 43RD STREET
-------------|---------------
140-5816     | 1 x 8TH AVE
             | 1 x WEST 45TH STREET
             | 1 x WEST 45 ST
-------------|---------------
105-8347     | 1 x WEST 181 ST
             | 1 x W 181 ST
-------------|---------------
140-5756     | 1 x WEST 50TH STREET
             | 1 x W 50th St
-------------|---------------
143-3793     | 1 x WEST 47TH ST
             | 1 x W 47 STREET
-------------|---------------
117-5778     | 2 x CENTRAL PARK SOUTH
             | 1 x Central Park South
-------------|---------------
144-6047     | 1 x WEST 37TH ST
             | 1 x WEST 37
-------------|---------------
140-6386     | 2 x 8TH AVE
             | 2 x 8TH AVENUE
-------------|---------------
144-3957     | 1 x WEST 41ST STREET
             | 1 x WEST 41 STREET
-------------|---------------
143-3787     | 2 x WEST 44 STREET
             | 1 x W 44TH STREET
-------------|---------------
144-3959     | 1 x S/S CO. 41 ST
             | 1 x WEST 41 ST
-------------|---------------
140-5716     | 1 x C/O 41 ST
             | 1 x W 50th St
-------------|---------------
143-3767     | 1 x S/S E 52 ST
             | 1 x Madison Ave
-------------|---------------
140-9944     | 2 x 8TH AVENUE
             | 1 x 8TH AVE
-------------|---------------
114-9970     | 1 x W 181ST ST
             | 1 x CHRISTOPHER ST
-------------|---------------
105-8390     | 1 x BROADWAY
             | 1 x Broadway
-------------|---------------
119-4780     | 2 x Lexington Ave
             | 1 x LEXINGTON AVE
-------------|---------------
120-8052     | 2 x E 106 ST
             | 2 x E 106 STREET
-------------|---------------
431-3003     | 1 x Steinway St
             | 1 x Jamaica Ave
-------------|---------------
143-5694     | 3 x W 44TH ST
             | 1 x W 44 ST
-------------|---------------
144-6601     | 1 x WEST 32 STREET
             | 1 x W 32 ST
-------------|---------------
301-3609     | 2 x SCHERMERHORN STREET
             | 1 x SCHERMERHORN ST
-------------|---------------
105-8346     | 1 x C/O W 181
             | 1 x W 181ST ST
-------------|---------------
140-9954     | 1 x W 152 ST
             | 1 x W 52nd St
-------------|---------------
145-7412     | 1 x E 19th St
             | 1 x E 18th St
-------------|---------------
103-4489     | 1 x Columbus Ave
             | 1 x 8th Ave
-------------|---------------
301-3678     | 1 x State St
             | 1 x Front St
-------------|---------------
143-3901     | 1 x Madison Ave
             | 1 x E 59th St
-------------|---------------
407-2167     | 1 x 38th St
             | 1 x 35th Ave
-------------|---------------
201-3042     | 1 x Tiebout Ave
             | 1 x Jerome Ave
             | 1 x W 183rd St
-------------|---------------
407-2206     | 1 x 31st Ave
             | 1 x 37th St
-------------|---------------
201-3012     | 1 x Jerome Ave
             | 1 x Grand Ave
-------------|---------------
226-2760     | 1 x E 170th St
             | 1 x Broadway
-------------|---------------
[10]:
# The Plate ID and Registration State should identify a vehicle uniquely. Here
# we focus on the vehicle color.
#
# Find violations of the FD ['Plate ID', 'Registration State'] -> ['Vehicle Color']
groups = fd_violations(df, lhs=['Plate ID', 'Registration State'], rhs='Vehicle Color')
[11]:
# List Plate ID and Registration State for vehicles that have multiple colors in the dataset
# together with the set of different colors.

from collections import Counter

for key in groups:
    conflicts = Counter(groups.get(key)['Vehicle Color'].value_counts().keys()).most_common()
    c_format = ', '.join(['{}: {}'.format(val, cnt) for val, cnt in conflicts])
    print('{} = [{}]'.format(key, c_format))
('63272JM', 'NY') = [BROWN: 1, BWN: 1]
('99308MC', 'NY') = [WHITE: 1, WH: 1]
('DNB3070', 'NY') = [BLK: 1, BLACK: 1]
('95743JM', 'NY') = [BL: 1, BLUE: 1]
('63677JM', 'NY') = [BRN: 1, BROWN: 1]
('87071JS', 'NY') = [WH: 1, WHITE: 1]