# Parsing explanations for IPC classes

## Loading data

In [1]:
# Load the autoreload extension and set it to mode 2, which re-imports
# modules automatically before each cell is executed.
%load_ext autoreload
%autoreload 2

In [2]:
# Render matplotlib figures inline in the notebook output.
# NOTE(review): no plotting calls are visible in this part of the notebook — confirm this is still needed.
%matplotlib inline

In [3]:
import pandas as pd
import numpy as np

# Raise pandas' display limits: up to 5400 columns and 400 rows are rendered.
pd.options.display.max_columns = 5400
pd.options.display.max_rows = 400

In [4]:
src_dir = "./data/"

# One title-list file per section letter, 'A' through 'H'.
codes = list(map(chr, range(ord('A'), ord('H') + 1)))

# Read every section file first and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copies the frame on each call.
parts = []
for code in codes:
    filename = src_dir + "EN_ipc_title_list_20190101/" + "EN_ipc_section_" + code + "_title_list_20190101" + ".txt"
    # Tab-separated, no header row: first column is the code, second its explanation.
    part = pd.read_csv(filename, sep='\t', lineterminator='\n', header=None, names=["code", "explanation"], na_values="")
    parts.append(part)

# Each part keeps its own row index, exactly as the repeated append did.
df = pd.concat(parts)

print(df.shape)
df.head()

(75830, 2)


Unnamed: 0,code,explanation
0,A,HUMAN NECESSITIES
1,A01,AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI...
2,A01B,SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS...
3,A01B0001000000,Hand tools (edge trimmers for lawns A01G000306...
4,A01B0001020000,Spades; Shovels


In [5]:
# Discard every row that has a missing value in any column.
df = df.dropna()
print(df.shape)

(75827, 2)


In [6]:
# When a code occurs more than once, keep only its last occurrence.
df = df.drop_duplicates(subset="code", keep="last")
print(df.shape)
df.head()

(75287, 2)


Unnamed: 0,code,explanation
0,A,HUMAN NECESSITIES
1,A01,AGRICULTURE; FORESTRY; ANIMAL HUSBANDRY; HUNTI...
2,A01B,SOIL WORKING IN AGRICULTURE OR FORESTRY; PARTS...
3,A01B0001000000,Hand tools (edge trimmers for lawns A01G000306...
4,A01B0001020000,Spades; Shovels


In [7]:
def remove_parentheses(text):
    """Cut `text` at the first occurrence of " (" and return the leading part.

    If " (" does not occur in the string form of `text`, `text` is returned
    unchanged.
    """
    cut = str(text).find(" (")
    return text if cut == -1 else text[:cut]

def capitalize_uppercase(text):
    """Capitalize each '; '-separated part of a title that does not start in lowercase.

    Titles whose first character is a lowercase letter (e.g. "with teeth") are
    returned unchanged. Any other title is split on '; ' and every part is
    passed through ``str.capitalize``, so "HUMAN NECESSITIES" becomes
    "Human necessities". An empty string is returned as is instead of raising
    ``IndexError``.

    NOTE(review): ``str.capitalize`` lowercases everything after the first
    character, so mixed-case titles lose their inner capitals — confirm this
    is intended.
    """
    # Slice instead of indexing so that an empty string does not raise.
    if not text[:1].islower():
        text = '; '.join(part.capitalize() for part in text.split('; '))
    return text
    
# Strip the parenthesised tail first, then normalise the capitalisation.
df["explanation"] = (
    df["explanation"]
    .apply(remove_parentheses)
    .apply(capitalize_uppercase)
)

df.head()

Unnamed: 0,code,explanation
0,A,Human necessities
1,A01,Agriculture; Forestry; Animal husbandry; Hunti...
2,A01B,Soil working in agriculture or forestry; Parts...
3,A01B0001000000,Hand tools
4,A01B0001020000,Spades; Shovels


In [8]:
def parse_code(code):
    """Convert a 14-character symbol into its short ``<subclass><group>/<subgroup>`` form.

    Codes shorter than 14 characters are returned unchanged. Longer codes are
    read as three fixed-width fields: characters 0-3 (subclass), 4-7 (main
    group, zero-padded on the left) and 8-13 (subgroup, zero-padded on the
    right). For example "A01B0001020000" becomes "A01B1/02".

    All significant digits are kept: the main group is not limited to two
    digits and the subgroup is not limited to three, so "B29K0105000000"
    becomes "B29K105/00" and "A61K0031443900" becomes "A61K31/4439".

    NOTE(review): the 4 + 4 + 6 field layout is assumed from the sample codes
    in this notebook — confirm against the title-list format specification.
    """
    if len(code) < 14:
        return code

    subclass = code[:4]
    # Right-aligned field: strip the padding zeros on the left ("0001" -> "1").
    main_group = code[4:8].lstrip("0") or "0"
    # Left-aligned field: strip the padding zeros on the right, but always
    # keep at least two digits ("000000" -> "00", "100000" -> "10").
    subgroup = code[8:14].rstrip("0").ljust(2, "0")
    return subclass + main_group + "/" + subgroup

# Rewrite every code element-wise into its short form.
df["code"] = df["code"].map(parse_code)
df.head(20)

Unnamed: 0,code,explanation
0,A,Human necessities
1,A01,Agriculture; Forestry; Animal husbandry; Hunti...
2,A01B,Soil working in agriculture or forestry; Parts...
3,A01B1/00,Hand tools
4,A01B1/02,Spades; Shovels
5,A01B1/04,with teeth
6,A01B1/06,Hoes; Hand cultivators
7,A01B1/08,with a single blade
8,A01B1/10,with two or more blades
9,A01B1/12,with blades provided with teeth


In [9]:
def duplicate_classes(row):
    """Return `row` as a DataFrame, preceded by a short-code copy for "/00" codes.

    `row` is a Series whose first value is the code and whose second value is
    the explanation. If the code ends in "/00", the result has two rows: a
    copy whose code has the trailing "/00" removed (e.g. "A01B1"), followed by
    the original row. Otherwise the result is the single original row. Both
    rows keep the index label of `row`.
    """
    # Positional access via .iloc: the Series is labelled by column name.
    code = row.iloc[0]
    original = row.to_frame().transpose()

    # Match the full "/00" suffix, since exactly three characters are removed.
    if not code.endswith("/00"):
        return original

    alias = original.copy()
    alias.iloc[0, 0] = code[:-3]
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    return pd.concat([alias, original])

# Expand every row into a one- or two-row frame and concatenate them in a
# single step: DataFrame.append was removed in pandas 2.0 and re-copies the
# accumulated frame on every call.
expanded_rows = [duplicate_classes(row) for _, row in df.iterrows()]
duplicate_subclass = pd.concat(expanded_rows)

# Use the code as the index of the expanded frame.
duplicate_subclass = duplicate_subclass.set_index("code")
duplicate_subclass.head(20)

Unnamed: 0_level_0,explanation
code,Unnamed: 1_level_1
A,Human necessities
A01,Agriculture; Forestry; Animal husbandry; Hunti...
A01B,Soil working in agriculture or forestry; Parts...
A01B1,Hand tools
A01B1/00,Hand tools
A01B1/02,Spades; Shovels
A01B1/04,with teeth
A01B1/06,Hoes; Hand cultivators
A01B1/08,with a single blade
A01B1/10,with two or more blades


In [10]:
# Rebind `df` to the expanded frame, which is indexed by code.
df = duplicate_subclass

In [11]:
# Nested mapping keyed by the frame's index: {index label: {column name: value}}.
d = df.to_dict("index")

In [12]:
import json

# Serialise the mapping `d` as JSON into the demo data directory.
output_path = "../demo/data/explanations.json"
with open(output_path, mode='w') as json_file:
    json.dump(d, fp=json_file)