Engine - Datastore
This is an example notebook to demonstrate how to update and maintain dataset versions using the openclean engine.
[1]:
# Crete an persistent instance of the openclean engine. All the data
# files will be maintained in a sub-folder of the current working directory.
from openclean.engine.base import DB
db = DB(basedir='./archive', create=True)
[2]:
# Download an test dataset from the Socrata API:
# 'Bidders List Master' from domain data.vermont.gov
from openclean.data.source.socrata import Socrata
df = Socrata().dataset('y343-ur4c').load()
df
[2]:
Date | Title or Project | Bidders Name | City | State | Location 1 | |
---|---|---|---|---|---|---|
0 | 07/26/2017 | HE - NARCAN 4MG 7.25 | Adapt Pharma | RADNOR | PA | RADNOR, PA\n(40.038043, -75.344449) |
1 | 08/22/2017 | Lab Bend Fixture | NaN | NaN | NaN | NaN |
2 | 09/19/2017 | FTA Cards for DPS | GE Healthcare | Marborough | MA | Marborough, MA |
3 | 09/26/2017 | 02140-785 - Rescue Equipment | Reynolds & Son | Barre | VT | Barre, VT\n(44.200603, -72.505569) |
4 | 09/20/2017 | BGS - 170078 Trush Parking Lot Paving Repairs ... | S T Paving , Inc | Waterbury | VT | Waterbury, VT\n(44.334602, -72.753189) |
... | ... | ... | ... | ... | ... | ... |
190 | 08/09/2017 | BGS - AHS Janitorial Services - St Albans, VT | Loso's Professional J.S. | South Burlington | VT | South Burlington, VT\n(44.468286, -73.171594) |
191 | 09/06/2017 | BGS - A & E Window Restoration - 133 State St | VT Architects Collaborative | Randolph | VT | Randolph, VT\n(43.925266, -72.665754) |
192 | 09/07/2017 | VDH - Car Seats | Even Flo | Cullman | AL | Cullman, AL\n(34.173753, -86.843115) |
193 | 07/24/2017 | RFP DMV Registration Renewal Forms | RR Donnelley | Derry | NH | Derry, NH\n(42.881978, -71.324171) |
194 | 08/03/2017 | 02140-331 - Enclosed snowmobile trailers | Perfection Motorsports | Richmond | VT | Richmond, VT\n(44.405247, -72.992905) |
195 rows × 6 columns
[3]:
# Create a new persistent dataset archive from the downloaded
# data frame in the openclean engine database.
db.create(df, name='bidders', primary_key='Title or Project')
[3]:
<openclean.engine.dataset.FullDataset at 0x7f097e25e7c0>
[4]:
# Delete rows where bidder's name is empty.
from openclean.function.eval.null import IsEmpty
from openclean.operator.transform.filter import delete
db.commit(name='bidders', df=delete(df, IsEmpty('Bidders Name')))
[4]:
Date | Title or Project | Bidders Name | City | State | Location 1 | |
---|---|---|---|---|---|---|
0 | 07/26/2017 | HE - NARCAN 4MG 7.25 | Adapt Pharma | RADNOR | PA | RADNOR, PA\n(40.038043, -75.344449) |
1 | 08/22/2017 | Lab Bend Fixture | NaN | NaN | NaN | NaN |
2 | 09/19/2017 | FTA Cards for DPS | GE Healthcare | Marborough | MA | Marborough, MA |
3 | 09/26/2017 | 02140-785 - Rescue Equipment | Reynolds & Son | Barre | VT | Barre, VT\n(44.200603, -72.505569) |
4 | 09/20/2017 | BGS - 170078 Trush Parking Lot Paving Repairs ... | S T Paving , Inc | Waterbury | VT | Waterbury, VT\n(44.334602, -72.753189) |
... | ... | ... | ... | ... | ... | ... |
190 | 08/09/2017 | BGS - AHS Janitorial Services - St Albans, VT | Loso's Professional J.S. | South Burlington | VT | South Burlington, VT\n(44.468286, -73.171594) |
191 | 09/06/2017 | BGS - A & E Window Restoration - 133 State St | VT Architects Collaborative | Randolph | VT | Randolph, VT\n(43.925266, -72.665754) |
192 | 09/07/2017 | VDH - Car Seats | Even Flo | Cullman | AL | Cullman, AL\n(34.173753, -86.843115) |
193 | 07/24/2017 | RFP DMV Registration Renewal Forms | RR Donnelley | Derry | NH | Derry, NH\n(42.881978, -71.324171) |
194 | 08/03/2017 | 02140-331 - Enclosed snowmobile trailers | Perfection Motorsports | Richmond | VT | Richmond, VT\n(44.405247, -72.992905) |
195 rows × 6 columns
[5]:
# Register existing string functions as 'user defined functions'.
from openclean.function.value.text import to_lower, to_title, to_upper
db.register.eval(name='lower')(to_lower)
db.register.eval(name='upper')(to_upper)
db.register.eval(name='capitalize')(to_title)
# Print function registry.
db.library.functions().to_listing()
[5]:
[{'name': 'lower',
'namespace': None,
'columns': 1,
'columnLabels': ['value'],
'outputs': 1,
'parameters': []},
{'name': 'upper',
'namespace': None,
'columns': 1,
'columnLabels': ['value'],
'outputs': 1,
'parameters': []},
{'name': 'capitalize',
'namespace': None,
'columns': 1,
'columnLabels': ['value'],
'outputs': 1,
'parameters': []}]
[6]:
# Convert values in the 'Bidders Name' column to lower case.
db.dataset(name='bidders').update(columns='Bidders Name', func=db.library.functions().get('lower'))
[6]:
Date | Title or Project | Bidders Name | City | State | Location 1 | |
---|---|---|---|---|---|---|
0 | 07/26/2017 | HE - NARCAN 4MG 7.25 | adapt pharma | RADNOR | PA | RADNOR, PA\n(40.038043, -75.344449) |
1 | 08/22/2017 | Lab Bend Fixture | NaN | NaN | NaN | NaN |
2 | 09/19/2017 | FTA Cards for DPS | ge healthcare | Marborough | MA | Marborough, MA |
3 | 09/26/2017 | 02140-785 - Rescue Equipment | reynolds & son | Barre | VT | Barre, VT\n(44.200603, -72.505569) |
4 | 09/20/2017 | BGS - 170078 Trush Parking Lot Paving Repairs ... | s t paving , inc | Waterbury | VT | Waterbury, VT\n(44.334602, -72.753189) |
... | ... | ... | ... | ... | ... | ... |
190 | 08/09/2017 | BGS - AHS Janitorial Services - St Albans, VT | loso's professional j.s. | South Burlington | VT | South Burlington, VT\n(44.468286, -73.171594) |
191 | 09/06/2017 | BGS - A & E Window Restoration - 133 State St | vt architects collaborative | Randolph | VT | Randolph, VT\n(43.925266, -72.665754) |
192 | 09/07/2017 | VDH - Car Seats | even flo | Cullman | AL | Cullman, AL\n(34.173753, -86.843115) |
193 | 07/24/2017 | RFP DMV Registration Renewal Forms | rr donnelley | Derry | NH | Derry, NH\n(42.881978, -71.324171) |
194 | 08/03/2017 | 02140-331 - Enclosed snowmobile trailers | perfection motorsports | Richmond | VT | Richmond, VT\n(44.405247, -72.992905) |
195 rows × 6 columns
[7]:
# Show operations in the current dataset log.
for op in db.dataset('bidders').log():
print(op.descriptor)
{'optype': 'load', 'columns': None}
{'optype': 'commit', 'columns': None}
{'optype': 'update', 'columns': ['Bidders Name'], 'name': 'lower'}
[8]:
# Add a user-defined function as a simple column-operator.
# register.eval adds a method to the object that is returned
# by db.apply that currently expects a single argument named
# 'columns' which specifies the column(s) on which the registered
# function is applied (evaluated) to create a modified dataset.
@db.register.eval('zigzag')
def zigzag_case(value):
"""Take a given string and return a string where
upper and lower cases alternate.
"""
result = ''
functions = [str.upper, str.lower]
i = 0
for c in str(value):
f = functions[i]
i = (i + 1) % 2
result += f(c)
return result
Notebook Spreadsheet UI
The following steps simulate some of the interactions that a user has with a dataset sample via the spreadsheet UI for Jupyter Notebooks.
[9]:
# Take a dataset sample of 10 rows.
db.sample(name='bidders', n=10, random_state=43)
[9]:
Date | Title or Project | Bidders Name | City | State | Location 1 | |
---|---|---|---|---|---|---|
110 | 07/25/2017 | HE RESPIRATOR FIT TESTER | tsi inc. | SHOREVIEW | MN | SHOREVIEW, MN\n(45.081261, -93.134983) |
165 | 08/23/2017 | BGS - BGS Snow Removal - Kengar State Office B... | d tatro construction | Hyde Park | VT | Hyde Park, VT\n(44.593002, -72.61145) |
10 | 08/25/2017 | Brine Maker | gvm | East Berlin | PA | East Berlin, PA\n(39.937556, -76.980498) |
144 | 08/30/2017 | BGS - Snow Removal Services for Middlesex Gene... | g & n excavation | Moretown | VT | Moretown, VT\n(44.250919, -72.761246) |
4 | 09/20/2017 | BGS - 170078 Trush Parking Lot Paving Repairs ... | s t paving , inc | Waterbury | VT | Waterbury, VT\n(44.334602, -72.753189) |
15 | 08/14/2017 | 02140-779 - Firefighter Helmets | reynolds & son | South Barre | VT | South Barre, VT\n(44.177155, -72.504898) |
25 | 08/25/2017 | Brine Storage Tanks | gvm | East Berlin | PA | East Berlin, PA\n(39.937556, -76.980498) |
24 | 08/30/2017 | Rolled Steel Plates | chillicothe steel | Chillicothe | OH | Chillicothe, OH\n(39.331846, -82.981776) |
96 | 08/04/2017 | Ballistic Panels | executive wood products | Sullivan | MO | Sullivan, MO\n(38.213599, -91.16411) |
111 | 08/25/2017 | MIL - 02 System Design - VT Veteran's Home - B... | goldstone architect | Bennington | VT | Bennington, VT\n(42.878372, -73.19709) |
[10]:
# Capitalize values in the 'Bidders Name' columns.
db.dataset(name='bidders').update(columns='Bidders Name', func=db.library.functions().get('capitalize'))
[10]:
Date | Title or Project | Bidders Name | City | State | Location 1 | |
---|---|---|---|---|---|---|
110 | 07/25/2017 | HE RESPIRATOR FIT TESTER | Tsi Inc. | SHOREVIEW | MN | SHOREVIEW, MN\n(45.081261, -93.134983) |
165 | 08/23/2017 | BGS - BGS Snow Removal - Kengar State Office B... | D Tatro Construction | Hyde Park | VT | Hyde Park, VT\n(44.593002, -72.61145) |
10 | 08/25/2017 | Brine Maker | Gvm | East Berlin | PA | East Berlin, PA\n(39.937556, -76.980498) |
144 | 08/30/2017 | BGS - Snow Removal Services for Middlesex Gene... | G & N Excavation | Moretown | VT | Moretown, VT\n(44.250919, -72.761246) |
4 | 09/20/2017 | BGS - 170078 Trush Parking Lot Paving Repairs ... | S T Paving , Inc | Waterbury | VT | Waterbury, VT\n(44.334602, -72.753189) |
15 | 08/14/2017 | 02140-779 - Firefighter Helmets | Reynolds & Son | South Barre | VT | South Barre, VT\n(44.177155, -72.504898) |
25 | 08/25/2017 | Brine Storage Tanks | Gvm | East Berlin | PA | East Berlin, PA\n(39.937556, -76.980498) |
24 | 08/30/2017 | Rolled Steel Plates | Chillicothe Steel | Chillicothe | OH | Chillicothe, OH\n(39.331846, -82.981776) |
96 | 08/04/2017 | Ballistic Panels | Executive Wood Products | Sullivan | MO | Sullivan, MO\n(38.213599, -91.16411) |
111 | 08/25/2017 | MIL - 02 System Design - VT Veteran's Home - B... | Goldstone Architect | Bennington | VT | Bennington, VT\n(42.878372, -73.19709) |
[11]:
# Apply the zig-zag function to column 'City'
db.dataset(name='bidders').update(columns='City', func=zigzag_case)
[11]:
Date | Title or Project | Bidders Name | City | State | Location 1 | |
---|---|---|---|---|---|---|
110 | 07/25/2017 | HE RESPIRATOR FIT TESTER | Tsi Inc. | ShOrEvIeW | MN | SHOREVIEW, MN\n(45.081261, -93.134983) |
165 | 08/23/2017 | BGS - BGS Snow Removal - Kengar State Office B... | D Tatro Construction | HyDe pArK | VT | Hyde Park, VT\n(44.593002, -72.61145) |
10 | 08/25/2017 | Brine Maker | Gvm | EaSt bErLiN | PA | East Berlin, PA\n(39.937556, -76.980498) |
144 | 08/30/2017 | BGS - Snow Removal Services for Middlesex Gene... | G & N Excavation | MoReToWn | VT | Moretown, VT\n(44.250919, -72.761246) |
4 | 09/20/2017 | BGS - 170078 Trush Parking Lot Paving Repairs ... | S T Paving , Inc | WaTeRbUrY | VT | Waterbury, VT\n(44.334602, -72.753189) |
15 | 08/14/2017 | 02140-779 - Firefighter Helmets | Reynolds & Son | SoUtH BaRrE | VT | South Barre, VT\n(44.177155, -72.504898) |
25 | 08/25/2017 | Brine Storage Tanks | Gvm | EaSt bErLiN | PA | East Berlin, PA\n(39.937556, -76.980498) |
24 | 08/30/2017 | Rolled Steel Plates | Chillicothe Steel | ChIlLiCoThE | OH | Chillicothe, OH\n(39.331846, -82.981776) |
96 | 08/04/2017 | Ballistic Panels | Executive Wood Products | SuLlIvAn | MO | Sullivan, MO\n(38.213599, -91.16411) |
111 | 08/25/2017 | MIL - 02 System Design - VT Veteran's Home - B... | Goldstone Architect | BeNnInGtOn | VT | Bennington, VT\n(42.878372, -73.19709) |
[12]:
# Show operations in the current dataset log (recipe). Note that
# each log entry has a unique identifier that is used to reference
# the represented dataset snapshot in checkout() and rollback()
# operations.
snapshots = list()
for op in db.dataset('bidders').log():
print('{} {}'.format(op.version, op.descriptor))
snapshots.append(op.version)
0 {'optype': 'sample', 'columns': None, 'arguments': [{'name': 'n', 'value': 10}, {'name': 'randomState', 'value': 43}]}
1 {'optype': 'update', 'columns': ['Bidders Name'], 'name': 'capitalize'}
2 {'optype': 'update', 'columns': ['City'], 'name': 'zigzag'}
[13]:
# Print snapshot that resulted from the second operation
# in the recipe, i.e., the capitalize operation on the
# 'Bidders Name'. City names should not be zig-zag in this
# snapshot.
db.dataset('bidders').checkout(snapshots[1])
[13]:
Date | Title or Project | Bidders Name | City | State | Location 1 | |
---|---|---|---|---|---|---|
110 | 07/25/2017 | HE RESPIRATOR FIT TESTER | Tsi Inc. | SHOREVIEW | MN | SHOREVIEW, MN\n(45.081261, -93.134983) |
165 | 08/23/2017 | BGS - BGS Snow Removal - Kengar State Office B... | D Tatro Construction | Hyde Park | VT | Hyde Park, VT\n(44.593002, -72.61145) |
10 | 08/25/2017 | Brine Maker | Gvm | East Berlin | PA | East Berlin, PA\n(39.937556, -76.980498) |
144 | 08/30/2017 | BGS - Snow Removal Services for Middlesex Gene... | G & N Excavation | Moretown | VT | Moretown, VT\n(44.250919, -72.761246) |
4 | 09/20/2017 | BGS - 170078 Trush Parking Lot Paving Repairs ... | S T Paving , Inc | Waterbury | VT | Waterbury, VT\n(44.334602, -72.753189) |
15 | 08/14/2017 | 02140-779 - Firefighter Helmets | Reynolds & Son | South Barre | VT | South Barre, VT\n(44.177155, -72.504898) |
25 | 08/25/2017 | Brine Storage Tanks | Gvm | East Berlin | PA | East Berlin, PA\n(39.937556, -76.980498) |
24 | 08/30/2017 | Rolled Steel Plates | Chillicothe Steel | Chillicothe | OH | Chillicothe, OH\n(39.331846, -82.981776) |
96 | 08/04/2017 | Ballistic Panels | Executive Wood Products | Sullivan | MO | Sullivan, MO\n(38.213599, -91.16411) |
111 | 08/25/2017 | MIL - 02 System Design - VT Veteran's Home - B... | Goldstone Architect | Bennington | VT | Bennington, VT\n(42.878372, -73.19709) |
[14]:
# Remove the zig-zag operation from the dataset history
# by rolling back to the previous operation.
db.dataset('bidders').rollback(snapshots[1])
[14]:
Date | Title or Project | Bidders Name | City | State | Location 1 | |
---|---|---|---|---|---|---|
110 | 07/25/2017 | HE RESPIRATOR FIT TESTER | Tsi Inc. | SHOREVIEW | MN | SHOREVIEW, MN\n(45.081261, -93.134983) |
165 | 08/23/2017 | BGS - BGS Snow Removal - Kengar State Office B... | D Tatro Construction | Hyde Park | VT | Hyde Park, VT\n(44.593002, -72.61145) |
10 | 08/25/2017 | Brine Maker | Gvm | East Berlin | PA | East Berlin, PA\n(39.937556, -76.980498) |
144 | 08/30/2017 | BGS - Snow Removal Services for Middlesex Gene... | G & N Excavation | Moretown | VT | Moretown, VT\n(44.250919, -72.761246) |
4 | 09/20/2017 | BGS - 170078 Trush Parking Lot Paving Repairs ... | S T Paving , Inc | Waterbury | VT | Waterbury, VT\n(44.334602, -72.753189) |
15 | 08/14/2017 | 02140-779 - Firefighter Helmets | Reynolds & Son | South Barre | VT | South Barre, VT\n(44.177155, -72.504898) |
25 | 08/25/2017 | Brine Storage Tanks | Gvm | East Berlin | PA | East Berlin, PA\n(39.937556, -76.980498) |
24 | 08/30/2017 | Rolled Steel Plates | Chillicothe Steel | Chillicothe | OH | Chillicothe, OH\n(39.331846, -82.981776) |
96 | 08/04/2017 | Ballistic Panels | Executive Wood Products | Sullivan | MO | Sullivan, MO\n(38.213599, -91.16411) |
111 | 08/25/2017 | MIL - 02 System Design - VT Veteran's Home - B... | Goldstone Architect | Bennington | VT | Bennington, VT\n(42.878372, -73.19709) |
[15]:
# Show remaining operations in the current dataset log.
for op in db.dataset('bidders').log():
print('{} {}'.format(op.version, op.descriptor))
snapshots.append(op.version)
0 {'optype': 'sample', 'columns': None, 'arguments': [{'name': 'n', 'value': 10}, {'name': 'randomState', 'value': 43}]}
1 {'optype': 'update', 'columns': ['Bidders Name'], 'name': 'capitalize'}
[16]:
# Apply changes to the full dataset.
db.checkout('bidders', commit=True)
[16]:
Date | Title or Project | Bidders Name | City | State | Location 1 | |
---|---|---|---|---|---|---|
0 | 07/26/2017 | HE - NARCAN 4MG 7.25 | Adapt Pharma | RADNOR | PA | RADNOR, PA\n(40.038043, -75.344449) |
1 | 08/22/2017 | Lab Bend Fixture | NaN | NaN | NaN | NaN |
2 | 09/19/2017 | FTA Cards for DPS | Ge Healthcare | Marborough | MA | Marborough, MA |
3 | 09/26/2017 | 02140-785 - Rescue Equipment | Reynolds & Son | Barre | VT | Barre, VT\n(44.200603, -72.505569) |
4 | 09/20/2017 | BGS - 170078 Trush Parking Lot Paving Repairs ... | S T Paving , Inc | Waterbury | VT | Waterbury, VT\n(44.334602, -72.753189) |
... | ... | ... | ... | ... | ... | ... |
190 | 08/09/2017 | BGS - AHS Janitorial Services - St Albans, VT | Loso'S Professional J.S. | South Burlington | VT | South Burlington, VT\n(44.468286, -73.171594) |
191 | 09/06/2017 | BGS - A & E Window Restoration - 133 State St | Vt Architects Collaborative | Randolph | VT | Randolph, VT\n(43.925266, -72.665754) |
192 | 09/07/2017 | VDH - Car Seats | Even Flo | Cullman | AL | Cullman, AL\n(34.173753, -86.843115) |
193 | 07/24/2017 | RFP DMV Registration Renewal Forms | Rr Donnelley | Derry | NH | Derry, NH\n(42.881978, -71.324171) |
194 | 08/03/2017 | 02140-331 - Enclosed snowmobile trailers | Perfection Motorsports | Richmond | VT | Richmond, VT\n(44.405247, -72.992905) |
195 rows × 6 columns
[17]:
# Show operations in the resulting dataset log.
for op in db.dataset('bidders').log():
print(op.version, op.descriptor)
0 {'optype': 'load', 'columns': None}
1 {'optype': 'commit', 'columns': None}
2 {'optype': 'update', 'columns': ['Bidders Name'], 'name': 'lower'}
3 {'optype': 'update', 'columns': ['Bidders Name'], 'name': 'capitalize'}
Rollback Changes in Persistent Archive
[18]:
# Rollback changes in the persistent archive to version 2.
df = db.rollback('bidders', version=2)
df
[18]:
Date | Title or Project | Bidders Name | City | State | Location 1 | |
---|---|---|---|---|---|---|
143 | 07/26/2017 | HE - NARCAN 4MG 7.25 | adapt pharma | RADNOR | PA | RADNOR, PA\n(40.038043, -75.344449) |
145 | 08/22/2017 | Lab Bend Fixture | None | None | None | None |
138 | 09/19/2017 | FTA Cards for DPS | ge healthcare | Marborough | MA | Marborough, MA |
24 | 09/26/2017 | 02140-785 - Rescue Equipment | reynolds & son | Barre | VT | Barre, VT\n(44.200603, -72.505569) |
59 | 09/20/2017 | BGS - 170078 Trush Parking Lot Paving Repairs ... | s t paving , inc | Waterbury | VT | Waterbury, VT\n(44.334602, -72.753189) |
... | ... | ... | ... | ... | ... | ... |
64 | 08/09/2017 | BGS - AHS Janitorial Services - St Albans, VT | loso's professional j.s. | South Burlington | VT | South Burlington, VT\n(44.468286, -73.171594) |
62 | 09/06/2017 | BGS - A & E Window Restoration - 133 State St | vt architects collaborative | Randolph | VT | Randolph, VT\n(43.925266, -72.665754) |
179 | 09/07/2017 | VDH - Car Seats | even flo | Cullman | AL | Cullman, AL\n(34.173753, -86.843115) |
159 | 07/24/2017 | RFP DMV Registration Renewal Forms | rr donnelley | Derry | NH | Derry, NH\n(42.881978, -71.324171) |
3 | 08/03/2017 | 02140-331 - Enclosed snowmobile trailers | perfection motorsports | Richmond | VT | Richmond, VT\n(44.405247, -72.992905) |
195 rows × 6 columns