Misspellings in Country Names
This notebook contains an example that uses the list of country names provided by the restcountries project to identify potential misspelled names in the ‘ITU ICT Development Index (IDI)’.
Download Country Names Masterdata
[1]:
# Create a new local archive for the countries dataset. All archive data will be stored in
# a sub-folder 'data'.
datadir = './data'
[2]:
# Download the current listing of country names in the world.
from openclean.data.refdata import RefStore
import openclean.data.archive.base as masterdata
refstore = RefStore(basedir=datadir)
refstore.download('restcountries.eu')
countries = refstore.load('restcountries.eu').df()
print(countries.head())
archive = masterdata.create('restcountries', source=countries, primary_key=['alpha3Code'], replace=True)
name alpha2Code alpha3Code capital region subregion
0 Afghanistan AF AFG Kabul Asia Southern Asia
1 Åland Islands AX ALA Mariehamn Europe Northern Europe
2 Albania AL ALB Tirana Europe Southern Europe
3 Algeria DZ DZA Algiers Africa Northern Africa
4 American Samoa AS ASM Pago Pago Oceania Polynesia
Identify Country Name Outliers in ITU ICT Development Index (IDI)
[3]:
# Download the latest version of the 'ITU ICT Development Index (IDI)' from Socrata.
import os
from openclean.data.source.socrata import Socrata
idi = Socrata().dataset('3bxy-wfk9').load()
idi.head()
[3]:
year | country_id | country_name | sub_index | value_type | value | |
---|---|---|---|---|---|---|
0 | 2015 | KOR | Korea (Rep.) | NaN | rank | 1.0 |
1 | 2015 | DNK | Denmark | NaN | rank | 2.0 |
2 | 2015 | ISL | Iceland | NaN | rank | 3.0 |
3 | 2015 | GBR | United Kingdom | NaN | rank | 4.0 |
4 | 2015 | SWE | Sweden | NaN | rank | 5.0 |
[4]:
# Use the restcountries country names as the ground truth domain to identify names in the
# IDI that do not occur in the grounth truth.
from openclean.profiling.anomalies.domain import domain_outliers
unknown_countries = domain_outliers(idi, 'country_name', countries['name'])
unknown_countries
[4]:
['Korea (Rep.)',
'United Kingdom',
'Hong Kong, China',
'United States',
'Macao, China',
'TFYR Macedonia',
'Antigua & Barbuda',
'St. Kitts and Nevis',
'Moldova',
'St. Vincent and the Grenadines',
'Trinidad & Tobago',
'Venezuela',
'St. Lucia',
'Iran (I.R.)',
'Cape Verde',
'Dominican Rep.',
'Bolivia',
'Syria',
"C™te d'Ivoire",
'Lao P.D.R.',
'Congo (Rep.)',
'Tanzania',
'Congo (Dem. Rep.)']
Repair Country Name Outliers in ITU ICT Development Index (IDI)
[5]:
# Use the restcountries country codes as the ground truth domain to identify country identifier in the
# IDI that do not occur in the grounth truth.
unknown_codes = domain_outliers(idi, 'country_id', countries['alpha3Code'])
unknown_codes
[5]:
[]
[6]:
# User the country codes in the IDI datasets to create a mapping from the identified outliers in IDI to the
# country name in the ground truth dataset.
import pandas as pd
from openclean.data.util import repair_mapping
# Create a dictionary that maps the 'unknown' country names from the IDI dataset to the respective
# ground truth name based on the country code.
df = pd.merge(idi, countries, left_on='country_id', right_on='alpha3Code')
lookup = repair_mapping(df, key='country_name', value='name')
print('Replace {} with {}\n------------------')
for key, value in lookup.items():
print("'{}' -> '{}'".format(key, value))
Replace {} with {}
------------------
'Korea (Rep.)' -> 'Korea (Republic of)'
'United Kingdom' -> 'United Kingdom of Great Britain and Northern Ireland'
'Hong Kong, China' -> 'Hong Kong'
'United States' -> 'United States of America'
'Macao, China' -> 'Macao'
'TFYR Macedonia' -> 'Macedonia (the former Yugoslav Republic of)'
'Antigua & Barbuda' -> 'Antigua and Barbuda'
'St. Kitts and Nevis' -> 'Saint Kitts and Nevis'
'Moldova' -> 'Moldova (Republic of)'
'St. Vincent and the Grenadines' -> 'Saint Vincent and the Grenadines'
'Trinidad & Tobago' -> 'Trinidad and Tobago'
'Venezuela' -> 'Venezuela (Bolivarian Republic of)'
'St. Lucia' -> 'Saint Lucia'
'Iran (I.R.)' -> 'Iran (Islamic Republic of)'
'Cape Verde' -> 'Cabo Verde'
'Dominican Rep.' -> 'Dominican Republic'
'Bolivia' -> 'Bolivia (Plurinational State of)'
'Syria' -> 'Syrian Arab Republic'
'C™te d'Ivoire' -> 'Côte d'Ivoire'
'Lao P.D.R.' -> 'Lao People's Democratic Republic'
'Congo (Rep.)' -> 'Congo'
'Tanzania' -> 'Tanzania, United Republic of'
'Congo (Dem. Rep.)' -> 'Congo (Democratic Republic of the)'
[7]:
# Use the mapping to replace unknown country names in IDI with their respective
# ground truth value.
from openclean.function.eval.domain import Lookup
from openclean.operator.transform.update import update
idi_repaired = update(idi, 'country_name', Lookup('country_name', lookup))
[8]:
idi_repaired.head()
[8]:
year | country_id | country_name | sub_index | value_type | value | |
---|---|---|---|---|---|---|
0 | 2015 | KOR | Korea (Republic of) | NaN | rank | 1.0 |
1 | 2015 | DNK | Denmark | NaN | rank | 2.0 |
2 | 2015 | ISL | Iceland | NaN | rank | 3.0 |
3 | 2015 | GBR | United Kingdom of Great Britain and Northern I... | NaN | rank | 4.0 |
4 | 2015 | SWE | Sweden | NaN | rank | 5.0 |