Misspellings in Country Names

This notebook contains an example that uses the list of country names provided by the restcountries project to identify potential misspelled names in the ‘ITU ICT Development Index (IDI)’.

Download Country Names Masterdata

[1]:
# Create a new local archive for the countries dataset. All archive data will be stored in
# a sub-folder 'data'.

datadir = './data'
[2]:
# Download the current listing of country names in the world.

from openclean.data.refdata import RefStore

import openclean.data.archive.base as masterdata

refstore = RefStore(basedir=datadir)
refstore.download('restcountries.eu')
countries = refstore.load('restcountries.eu').df()

print(countries.head())

archive = masterdata.create('restcountries', source=countries, primary_key=['alpha3Code'], replace=True)
             name alpha2Code alpha3Code    capital   region        subregion
0     Afghanistan         AF        AFG      Kabul     Asia    Southern Asia
1   Åland Islands         AX        ALA  Mariehamn   Europe  Northern Europe
2         Albania         AL        ALB     Tirana   Europe  Southern Europe
3         Algeria         DZ        DZA    Algiers   Africa  Northern Africa
4  American Samoa         AS        ASM  Pago Pago  Oceania        Polynesia

Identify Country Name Outliers in ITU ICT Development Index (IDI)

[3]:
# Download the latest version of the 'ITU ICT Development Index (IDI)' from Socrata.

import os

from openclean.data.source.socrata import Socrata

idi = Socrata().dataset('3bxy-wfk9').load()
idi.head()
[3]:
year country_id country_name sub_index value_type value
0 2015 KOR Korea (Rep.) NaN rank 1.0
1 2015 DNK Denmark NaN rank 2.0
2 2015 ISL Iceland NaN rank 3.0
3 2015 GBR United Kingdom NaN rank 4.0
4 2015 SWE Sweden NaN rank 5.0
[4]:
# Use the restcountries country names as the ground truth domain to identify names in the
# IDI that do not occur in the grounth truth.

from openclean.profiling.anomalies.domain import domain_outliers

unknown_countries = domain_outliers(idi, 'country_name', countries['name'])

unknown_countries
[4]:
['Korea (Rep.)',
 'United Kingdom',
 'Hong Kong, China',
 'United States',
 'Macao, China',
 'TFYR Macedonia',
 'Antigua & Barbuda',
 'St. Kitts and Nevis',
 'Moldova',
 'St. Vincent and the Grenadines',
 'Trinidad & Tobago',
 'Venezuela',
 'St. Lucia',
 'Iran (I.R.)',
 'Cape Verde',
 'Dominican Rep.',
 'Bolivia',
 'Syria',
 "C™te d'Ivoire",
 'Lao P.D.R.',
 'Congo (Rep.)',
 'Tanzania',
 'Congo (Dem. Rep.)']

Repair Country Name Outliers in ITU ICT Development Index (IDI)

[5]:
# Use the restcountries country codes as the ground truth domain to identify country identifier in the
# IDI that do not occur in the grounth truth.

unknown_codes = domain_outliers(idi, 'country_id', countries['alpha3Code'])

unknown_codes
[5]:
[]
[6]:
# User the country codes in the IDI datasets to create a mapping from the identified outliers in IDI to the
# country name in the ground truth dataset.

import pandas as pd

from openclean.data.util import repair_mapping

# Create a dictionary that maps the 'unknown' country names from the IDI dataset to the respective
# ground truth name based on the country code.

df = pd.merge(idi, countries, left_on='country_id', right_on='alpha3Code')
lookup = repair_mapping(df, key='country_name', value='name')

print('Replace {} with {}\n------------------')
for key, value in lookup.items():
    print("'{}' -> '{}'".format(key, value))
Replace {} with {}
------------------
'Korea (Rep.)' -> 'Korea (Republic of)'
'United Kingdom' -> 'United Kingdom of Great Britain and Northern Ireland'
'Hong Kong, China' -> 'Hong Kong'
'United States' -> 'United States of America'
'Macao, China' -> 'Macao'
'TFYR Macedonia' -> 'Macedonia (the former Yugoslav Republic of)'
'Antigua & Barbuda' -> 'Antigua and Barbuda'
'St. Kitts and Nevis' -> 'Saint Kitts and Nevis'
'Moldova' -> 'Moldova (Republic of)'
'St. Vincent and the Grenadines' -> 'Saint Vincent and the Grenadines'
'Trinidad & Tobago' -> 'Trinidad and Tobago'
'Venezuela' -> 'Venezuela (Bolivarian Republic of)'
'St. Lucia' -> 'Saint Lucia'
'Iran (I.R.)' -> 'Iran (Islamic Republic of)'
'Cape Verde' -> 'Cabo Verde'
'Dominican Rep.' -> 'Dominican Republic'
'Bolivia' -> 'Bolivia (Plurinational State of)'
'Syria' -> 'Syrian Arab Republic'
'C™te d'Ivoire' -> 'Côte d'Ivoire'
'Lao P.D.R.' -> 'Lao People's Democratic Republic'
'Congo (Rep.)' -> 'Congo'
'Tanzania' -> 'Tanzania, United Republic of'
'Congo (Dem. Rep.)' -> 'Congo (Democratic Republic of the)'
[7]:
# Use the mapping to replace unknown country names in IDI with their respective
# ground truth value.

from openclean.function.eval.domain import Lookup
from openclean.operator.transform.update import update

idi_repaired = update(idi, 'country_name', Lookup('country_name', lookup))
[8]:
idi_repaired.head()
[8]:
year country_id country_name sub_index value_type value
0 2015 KOR Korea (Republic of) NaN rank 1.0
1 2015 DNK Denmark NaN rank 2.0
2 2015 ISL Iceland NaN rank 3.0
3 2015 GBR United Kingdom of Great Britain and Northern I... NaN rank 4.0
4 2015 SWE Sweden NaN rank 5.0