Engine - Datastore

This is an example notebook to demonstrate how to update and maintain dataset versions using the openclean engine.

[1]:
# Crete an persistent instance of the openclean engine. All the data
# files will be maintained in a sub-folder of the current working directory.

from openclean.engine.base import DB

db = DB(basedir='./archive', create=True)
[2]:
# Download an test dataset from the Socrata API:
# 'Bidders List Master' from domain data.vermont.gov

from openclean.data.source.socrata import Socrata

df = Socrata().dataset('y343-ur4c').load()
df
[2]:
Date Title or Project Bidders Name City State Location 1
0 07/26/2017 HE - NARCAN 4MG 7.25 Adapt Pharma RADNOR PA RADNOR, PA\n(40.038043, -75.344449)
1 08/22/2017 Lab Bend Fixture NaN NaN NaN NaN
2 09/19/2017 FTA Cards for DPS GE Healthcare Marborough MA Marborough, MA
3 09/26/2017 02140-785 - Rescue Equipment Reynolds & Son Barre VT Barre, VT\n(44.200603, -72.505569)
4 09/20/2017 BGS - 170078 Trush Parking Lot Paving Repairs ... S T Paving , Inc Waterbury VT Waterbury, VT\n(44.334602, -72.753189)
... ... ... ... ... ... ...
190 08/09/2017 BGS - AHS Janitorial Services - St Albans, VT Loso's Professional J.S. South Burlington VT South Burlington, VT\n(44.468286, -73.171594)
191 09/06/2017 BGS - A & E Window Restoration - 133 State St VT Architects Collaborative Randolph VT Randolph, VT\n(43.925266, -72.665754)
192 09/07/2017 VDH - Car Seats Even Flo Cullman AL Cullman, AL\n(34.173753, -86.843115)
193 07/24/2017 RFP DMV Registration Renewal Forms RR Donnelley Derry NH Derry, NH\n(42.881978, -71.324171)
194 08/03/2017 02140-331 - Enclosed snowmobile trailers Perfection Motorsports Richmond VT Richmond, VT\n(44.405247, -72.992905)

195 rows × 6 columns

[3]:
# Create a new persistent dataset archive from the downloaded
# data frame in the openclean engine database.

db.create(df, name='bidders', primary_key='Title or Project')
[3]:
<openclean.engine.dataset.FullDataset at 0x7f097e25e7c0>
[4]:
# Delete rows where bidder's name is empty.

from openclean.function.eval.null import IsEmpty
from openclean.operator.transform.filter import delete

db.commit(name='bidders', df=delete(df, IsEmpty('Bidders Name')))
[4]:
Date Title or Project Bidders Name City State Location 1
0 07/26/2017 HE - NARCAN 4MG 7.25 Adapt Pharma RADNOR PA RADNOR, PA\n(40.038043, -75.344449)
1 08/22/2017 Lab Bend Fixture NaN NaN NaN NaN
2 09/19/2017 FTA Cards for DPS GE Healthcare Marborough MA Marborough, MA
3 09/26/2017 02140-785 - Rescue Equipment Reynolds & Son Barre VT Barre, VT\n(44.200603, -72.505569)
4 09/20/2017 BGS - 170078 Trush Parking Lot Paving Repairs ... S T Paving , Inc Waterbury VT Waterbury, VT\n(44.334602, -72.753189)
... ... ... ... ... ... ...
190 08/09/2017 BGS - AHS Janitorial Services - St Albans, VT Loso's Professional J.S. South Burlington VT South Burlington, VT\n(44.468286, -73.171594)
191 09/06/2017 BGS - A & E Window Restoration - 133 State St VT Architects Collaborative Randolph VT Randolph, VT\n(43.925266, -72.665754)
192 09/07/2017 VDH - Car Seats Even Flo Cullman AL Cullman, AL\n(34.173753, -86.843115)
193 07/24/2017 RFP DMV Registration Renewal Forms RR Donnelley Derry NH Derry, NH\n(42.881978, -71.324171)
194 08/03/2017 02140-331 - Enclosed snowmobile trailers Perfection Motorsports Richmond VT Richmond, VT\n(44.405247, -72.992905)

195 rows × 6 columns

[5]:
# Register existing string functions as 'user defined functions'.
from openclean.function.value.text import to_lower, to_title, to_upper

db.register.eval(name='lower')(to_lower)
db.register.eval(name='upper')(to_upper)
db.register.eval(name='capitalize')(to_title)

# Print function registry.
db.library.functions().to_listing()
[5]:
[{'name': 'lower',
  'namespace': None,
  'columns': 1,
  'columnLabels': ['value'],
  'outputs': 1,
  'parameters': []},
 {'name': 'upper',
  'namespace': None,
  'columns': 1,
  'columnLabels': ['value'],
  'outputs': 1,
  'parameters': []},
 {'name': 'capitalize',
  'namespace': None,
  'columns': 1,
  'columnLabels': ['value'],
  'outputs': 1,
  'parameters': []}]
[6]:
# Convert values in the 'Bidders Name' column to lower case.

db.dataset(name='bidders').update(columns='Bidders Name', func=db.library.functions().get('lower'))
[6]:
Date Title or Project Bidders Name City State Location 1
0 07/26/2017 HE - NARCAN 4MG 7.25 adapt pharma RADNOR PA RADNOR, PA\n(40.038043, -75.344449)
1 08/22/2017 Lab Bend Fixture NaN NaN NaN NaN
2 09/19/2017 FTA Cards for DPS ge healthcare Marborough MA Marborough, MA
3 09/26/2017 02140-785 - Rescue Equipment reynolds & son Barre VT Barre, VT\n(44.200603, -72.505569)
4 09/20/2017 BGS - 170078 Trush Parking Lot Paving Repairs ... s t paving , inc Waterbury VT Waterbury, VT\n(44.334602, -72.753189)
... ... ... ... ... ... ...
190 08/09/2017 BGS - AHS Janitorial Services - St Albans, VT loso's professional j.s. South Burlington VT South Burlington, VT\n(44.468286, -73.171594)
191 09/06/2017 BGS - A & E Window Restoration - 133 State St vt architects collaborative Randolph VT Randolph, VT\n(43.925266, -72.665754)
192 09/07/2017 VDH - Car Seats even flo Cullman AL Cullman, AL\n(34.173753, -86.843115)
193 07/24/2017 RFP DMV Registration Renewal Forms rr donnelley Derry NH Derry, NH\n(42.881978, -71.324171)
194 08/03/2017 02140-331 - Enclosed snowmobile trailers perfection motorsports Richmond VT Richmond, VT\n(44.405247, -72.992905)

195 rows × 6 columns

[7]:
# Show operations in the current dataset log.

for op in db.dataset('bidders').log():
    print(op.descriptor)
{'optype': 'load', 'columns': None}
{'optype': 'commit', 'columns': None}
{'optype': 'update', 'columns': ['Bidders Name'], 'name': 'lower'}
[8]:
# Add a user-defined function as a simple column-operator.
# register.eval adds a method to the object that is returned
# by db.apply that currently expects a single argument named
# 'columns' which specifies the column(s) on which the registered
# function is applied (evaluated) to create a modified dataset.

@db.register.eval('zigzag')
def zigzag_case(value):
    """Take a given string and return a string where
    upper and lower cases alternate.
    """
    result = ''
    functions = [str.upper, str.lower]
    i = 0
    for c in str(value):
        f = functions[i]
        i = (i + 1) % 2
        result += f(c)
    return result

Notebook Spreadsheet UI

The following steps simulate some of the interactions that a user has with a dataset sample via the spreadsheet UI for Jupyter Notebooks.

[9]:
# Take a dataset sample of 10 rows.

db.sample(name='bidders', n=10, random_state=43)
[9]:
Date Title or Project Bidders Name City State Location 1
110 07/25/2017 HE RESPIRATOR FIT TESTER tsi inc. SHOREVIEW MN SHOREVIEW, MN\n(45.081261, -93.134983)
165 08/23/2017 BGS - BGS Snow Removal - Kengar State Office B... d tatro construction Hyde Park VT Hyde Park, VT\n(44.593002, -72.61145)
10 08/25/2017 Brine Maker gvm East Berlin PA East Berlin, PA\n(39.937556, -76.980498)
144 08/30/2017 BGS - Snow Removal Services for Middlesex Gene... g & n excavation Moretown VT Moretown, VT\n(44.250919, -72.761246)
4 09/20/2017 BGS - 170078 Trush Parking Lot Paving Repairs ... s t paving , inc Waterbury VT Waterbury, VT\n(44.334602, -72.753189)
15 08/14/2017 02140-779 - Firefighter Helmets reynolds & son South Barre VT South Barre, VT\n(44.177155, -72.504898)
25 08/25/2017 Brine Storage Tanks gvm East Berlin PA East Berlin, PA\n(39.937556, -76.980498)
24 08/30/2017 Rolled Steel Plates chillicothe steel Chillicothe OH Chillicothe, OH\n(39.331846, -82.981776)
96 08/04/2017 Ballistic Panels executive wood products Sullivan MO Sullivan, MO\n(38.213599, -91.16411)
111 08/25/2017 MIL - 02 System Design - VT Veteran's Home - B... goldstone architect Bennington VT Bennington, VT\n(42.878372, -73.19709)
[10]:
# Capitalize values in the 'Bidders Name' columns.

db.dataset(name='bidders').update(columns='Bidders Name', func=db.library.functions().get('capitalize'))
[10]:
Date Title or Project Bidders Name City State Location 1
110 07/25/2017 HE RESPIRATOR FIT TESTER Tsi Inc. SHOREVIEW MN SHOREVIEW, MN\n(45.081261, -93.134983)
165 08/23/2017 BGS - BGS Snow Removal - Kengar State Office B... D Tatro Construction Hyde Park VT Hyde Park, VT\n(44.593002, -72.61145)
10 08/25/2017 Brine Maker Gvm East Berlin PA East Berlin, PA\n(39.937556, -76.980498)
144 08/30/2017 BGS - Snow Removal Services for Middlesex Gene... G & N Excavation Moretown VT Moretown, VT\n(44.250919, -72.761246)
4 09/20/2017 BGS - 170078 Trush Parking Lot Paving Repairs ... S T Paving , Inc Waterbury VT Waterbury, VT\n(44.334602, -72.753189)
15 08/14/2017 02140-779 - Firefighter Helmets Reynolds & Son South Barre VT South Barre, VT\n(44.177155, -72.504898)
25 08/25/2017 Brine Storage Tanks Gvm East Berlin PA East Berlin, PA\n(39.937556, -76.980498)
24 08/30/2017 Rolled Steel Plates Chillicothe Steel Chillicothe OH Chillicothe, OH\n(39.331846, -82.981776)
96 08/04/2017 Ballistic Panels Executive Wood Products Sullivan MO Sullivan, MO\n(38.213599, -91.16411)
111 08/25/2017 MIL - 02 System Design - VT Veteran's Home - B... Goldstone Architect Bennington VT Bennington, VT\n(42.878372, -73.19709)
[11]:
# Apply the zig-zag function to column 'City'

db.dataset(name='bidders').update(columns='City', func=zigzag_case)
[11]:
Date Title or Project Bidders Name City State Location 1
110 07/25/2017 HE RESPIRATOR FIT TESTER Tsi Inc. ShOrEvIeW MN SHOREVIEW, MN\n(45.081261, -93.134983)
165 08/23/2017 BGS - BGS Snow Removal - Kengar State Office B... D Tatro Construction HyDe pArK VT Hyde Park, VT\n(44.593002, -72.61145)
10 08/25/2017 Brine Maker Gvm EaSt bErLiN PA East Berlin, PA\n(39.937556, -76.980498)
144 08/30/2017 BGS - Snow Removal Services for Middlesex Gene... G & N Excavation MoReToWn VT Moretown, VT\n(44.250919, -72.761246)
4 09/20/2017 BGS - 170078 Trush Parking Lot Paving Repairs ... S T Paving , Inc WaTeRbUrY VT Waterbury, VT\n(44.334602, -72.753189)
15 08/14/2017 02140-779 - Firefighter Helmets Reynolds & Son SoUtH BaRrE VT South Barre, VT\n(44.177155, -72.504898)
25 08/25/2017 Brine Storage Tanks Gvm EaSt bErLiN PA East Berlin, PA\n(39.937556, -76.980498)
24 08/30/2017 Rolled Steel Plates Chillicothe Steel ChIlLiCoThE OH Chillicothe, OH\n(39.331846, -82.981776)
96 08/04/2017 Ballistic Panels Executive Wood Products SuLlIvAn MO Sullivan, MO\n(38.213599, -91.16411)
111 08/25/2017 MIL - 02 System Design - VT Veteran's Home - B... Goldstone Architect BeNnInGtOn VT Bennington, VT\n(42.878372, -73.19709)
[12]:
# Show operations in the current dataset log (recipe). Note that
# each log entry has a unique identifier that is used to reference
# the represented dataset snapshot in checkout() and rollback()
# operations.

snapshots = list()
for op in db.dataset('bidders').log():
    print('{} {}'.format(op.version, op.descriptor))
    snapshots.append(op.version)
0 {'optype': 'sample', 'columns': None, 'arguments': [{'name': 'n', 'value': 10}, {'name': 'randomState', 'value': 43}]}
1 {'optype': 'update', 'columns': ['Bidders Name'], 'name': 'capitalize'}
2 {'optype': 'update', 'columns': ['City'], 'name': 'zigzag'}
[13]:
# Print snapshot that resulted from the second operation
# in the recipe, i.e., the capitalize operation on the
# 'Bidders Name'. City names should not be zig-zag in this
# snapshot.

db.dataset('bidders').checkout(snapshots[1])
[13]:
Date Title or Project Bidders Name City State Location 1
110 07/25/2017 HE RESPIRATOR FIT TESTER Tsi Inc. SHOREVIEW MN SHOREVIEW, MN\n(45.081261, -93.134983)
165 08/23/2017 BGS - BGS Snow Removal - Kengar State Office B... D Tatro Construction Hyde Park VT Hyde Park, VT\n(44.593002, -72.61145)
10 08/25/2017 Brine Maker Gvm East Berlin PA East Berlin, PA\n(39.937556, -76.980498)
144 08/30/2017 BGS - Snow Removal Services for Middlesex Gene... G & N Excavation Moretown VT Moretown, VT\n(44.250919, -72.761246)
4 09/20/2017 BGS - 170078 Trush Parking Lot Paving Repairs ... S T Paving , Inc Waterbury VT Waterbury, VT\n(44.334602, -72.753189)
15 08/14/2017 02140-779 - Firefighter Helmets Reynolds & Son South Barre VT South Barre, VT\n(44.177155, -72.504898)
25 08/25/2017 Brine Storage Tanks Gvm East Berlin PA East Berlin, PA\n(39.937556, -76.980498)
24 08/30/2017 Rolled Steel Plates Chillicothe Steel Chillicothe OH Chillicothe, OH\n(39.331846, -82.981776)
96 08/04/2017 Ballistic Panels Executive Wood Products Sullivan MO Sullivan, MO\n(38.213599, -91.16411)
111 08/25/2017 MIL - 02 System Design - VT Veteran's Home - B... Goldstone Architect Bennington VT Bennington, VT\n(42.878372, -73.19709)
[14]:
# Remove the zig-zag operation from the dataset history
# by rolling back to the previous operation.

db.dataset('bidders').rollback(snapshots[1])
[14]:
Date Title or Project Bidders Name City State Location 1
110 07/25/2017 HE RESPIRATOR FIT TESTER Tsi Inc. SHOREVIEW MN SHOREVIEW, MN\n(45.081261, -93.134983)
165 08/23/2017 BGS - BGS Snow Removal - Kengar State Office B... D Tatro Construction Hyde Park VT Hyde Park, VT\n(44.593002, -72.61145)
10 08/25/2017 Brine Maker Gvm East Berlin PA East Berlin, PA\n(39.937556, -76.980498)
144 08/30/2017 BGS - Snow Removal Services for Middlesex Gene... G & N Excavation Moretown VT Moretown, VT\n(44.250919, -72.761246)
4 09/20/2017 BGS - 170078 Trush Parking Lot Paving Repairs ... S T Paving , Inc Waterbury VT Waterbury, VT\n(44.334602, -72.753189)
15 08/14/2017 02140-779 - Firefighter Helmets Reynolds & Son South Barre VT South Barre, VT\n(44.177155, -72.504898)
25 08/25/2017 Brine Storage Tanks Gvm East Berlin PA East Berlin, PA\n(39.937556, -76.980498)
24 08/30/2017 Rolled Steel Plates Chillicothe Steel Chillicothe OH Chillicothe, OH\n(39.331846, -82.981776)
96 08/04/2017 Ballistic Panels Executive Wood Products Sullivan MO Sullivan, MO\n(38.213599, -91.16411)
111 08/25/2017 MIL - 02 System Design - VT Veteran's Home - B... Goldstone Architect Bennington VT Bennington, VT\n(42.878372, -73.19709)
[15]:
# Show remaining operations in the current dataset log.

for op in db.dataset('bidders').log():
    print('{} {}'.format(op.version, op.descriptor))
    snapshots.append(op.version)
0 {'optype': 'sample', 'columns': None, 'arguments': [{'name': 'n', 'value': 10}, {'name': 'randomState', 'value': 43}]}
1 {'optype': 'update', 'columns': ['Bidders Name'], 'name': 'capitalize'}
[16]:
# Apply changes to the full dataset.

db.checkout('bidders', commit=True)
[16]:
Date Title or Project Bidders Name City State Location 1
0 07/26/2017 HE - NARCAN 4MG 7.25 Adapt Pharma RADNOR PA RADNOR, PA\n(40.038043, -75.344449)
1 08/22/2017 Lab Bend Fixture NaN NaN NaN NaN
2 09/19/2017 FTA Cards for DPS Ge Healthcare Marborough MA Marborough, MA
3 09/26/2017 02140-785 - Rescue Equipment Reynolds & Son Barre VT Barre, VT\n(44.200603, -72.505569)
4 09/20/2017 BGS - 170078 Trush Parking Lot Paving Repairs ... S T Paving , Inc Waterbury VT Waterbury, VT\n(44.334602, -72.753189)
... ... ... ... ... ... ...
190 08/09/2017 BGS - AHS Janitorial Services - St Albans, VT Loso'S Professional J.S. South Burlington VT South Burlington, VT\n(44.468286, -73.171594)
191 09/06/2017 BGS - A & E Window Restoration - 133 State St Vt Architects Collaborative Randolph VT Randolph, VT\n(43.925266, -72.665754)
192 09/07/2017 VDH - Car Seats Even Flo Cullman AL Cullman, AL\n(34.173753, -86.843115)
193 07/24/2017 RFP DMV Registration Renewal Forms Rr Donnelley Derry NH Derry, NH\n(42.881978, -71.324171)
194 08/03/2017 02140-331 - Enclosed snowmobile trailers Perfection Motorsports Richmond VT Richmond, VT\n(44.405247, -72.992905)

195 rows × 6 columns

[17]:
# Show operations in the resulting dataset log.

for op in db.dataset('bidders').log():
    print(op.version, op.descriptor)
0 {'optype': 'load', 'columns': None}
1 {'optype': 'commit', 'columns': None}
2 {'optype': 'update', 'columns': ['Bidders Name'], 'name': 'lower'}
3 {'optype': 'update', 'columns': ['Bidders Name'], 'name': 'capitalize'}

Rollback Changes in Persistent Archive

[18]:
# Rollback changes in the persistent archive to version 2.

df = db.rollback('bidders', version=2)
df
[18]:
Date Title or Project Bidders Name City State Location 1
143 07/26/2017 HE - NARCAN 4MG 7.25 adapt pharma RADNOR PA RADNOR, PA\n(40.038043, -75.344449)
145 08/22/2017 Lab Bend Fixture None None None None
138 09/19/2017 FTA Cards for DPS ge healthcare Marborough MA Marborough, MA
24 09/26/2017 02140-785 - Rescue Equipment reynolds & son Barre VT Barre, VT\n(44.200603, -72.505569)
59 09/20/2017 BGS - 170078 Trush Parking Lot Paving Repairs ... s t paving , inc Waterbury VT Waterbury, VT\n(44.334602, -72.753189)
... ... ... ... ... ... ...
64 08/09/2017 BGS - AHS Janitorial Services - St Albans, VT loso's professional j.s. South Burlington VT South Burlington, VT\n(44.468286, -73.171594)
62 09/06/2017 BGS - A & E Window Restoration - 133 State St vt architects collaborative Randolph VT Randolph, VT\n(43.925266, -72.665754)
179 09/07/2017 VDH - Car Seats even flo Cullman AL Cullman, AL\n(34.173753, -86.843115)
159 07/24/2017 RFP DMV Registration Renewal Forms rr donnelley Derry NH Derry, NH\n(42.881978, -71.324171)
3 08/03/2017 02140-331 - Enclosed snowmobile trailers perfection motorsports Richmond VT Richmond, VT\n(44.405247, -72.992905)

195 rows × 6 columns