# Creating different kinds of hierarchies for sunburst graph

## Loading data

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%matplotlib inline

In [3]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', 5400)
#pd.set_option('display.max_rows', 400)

In [4]:
import pickle 

#seed_name = 'hair_dryer'
#seed_name = 'video_codec'
#seed_name = 'diesel'
seed_name = "contact_lens"
#seed_name = "contact_lens_us_c"
#seed_name = "3d_printer"

src_dir = "../demo/data/"

with open("./data/" + seed_name + '/terms_attributes.pkl', 'rb') as infile:
    df = pickle.load(infile)
df.head()

Unnamed: 0,pub_num,publication_number,country_code,family_id,priority_date,title_text,abstract_text,claims_text,tokens,year,month,day,ipc_classes,first_letters,three_letters,four_letters,before_slash,references,referenced_by,raw_assignees,assignees,terms
0,4104187,US-4104187-A,US,24713901,19760412,Composition and method treating soft contact l...,Method and compositions are provided for eleva...,What is claimed is: \n \n 1. A soft ...,"[composition, treating, soft, contact, lenses,...",1976,4,12,"[A61L12/14, C11D1/52, C11D1/66, C11D1/90, C11D...","[A, C, G]","[A61, C11, G02]","[A61L, C11D, G02C]","[A61L12, C11D1, C11D10, C11D17, C11D3, G02C13,...","[US-3171752-A, US-4013576-A, US-2089212-A, US-...","[US-4259202-A, US-2006241001-A1, US-4533399-A,...",[BARNES HIND PHARM INC],[BARNES HIND INC],"[physiologically acceptable, disinfecting clea..."
1,6000534,US-6000534-A,US,24806490,19960816,Contact lens disinfecting device and disinfect...,The present invention provides a device for di...,I claim: \n \n 1. A device for disin...,"[contact, lens, disinfecting, device, disinfec...",1996,8,16,"[A45C11/00, A61L12/12, A61L2/18, G02C13/00]","[A, G]","[A45, A61, G02]","[A45C, A61L, G02C]","[A45C11, A61L12, A61L2, G02C13]","[US-4750610-A, US-4889693-A, US-5143104-A, US-...",[],[ALLERGAN SALES INC],[ADVANCED CORNEAL SYSTEMS INC],"[lens holding, cap, internal surface, containe..."
2,7281794,US-7281794-B2,US,25536666,20011116,Disposable ophthalmic lens,"An ophthalmic lens with high definition, wide ...",What is claimed and desired to be secured by L...,"[disposable, ophthalmic, lens, ophthalmic, len...",2001,11,16,"[A61B3/125, G02C7/04]","[A, G]","[A61, G02]","[A61B, G02C]","[A61B3, G02C7]","[US-6471396-B2, US-4921205-A, US-4613379-A, US...",[],[HEACOCK GREGORY L],[HEACOCK GREGORY L],"[image lens, lens image, holder, lens mounted,..."
3,8693104,US-8693104-B2,US,44651943,20100824,Lens with compound linear-convex meniscus wall,The present invention relates generally to an ...,The invention claimed is: \n \n 1. A...,"[lens, compound, linear, convex, meniscus, wal...",2010,8,24,"[G02B1/06, G02B3/12]",[G],[G02],[G02B],"[G02B1, G02B3]","[WO-2008062067-A1, US-7724444-B2, US-7311398-B...",[],"[KERNICK EDWARD R, RIALL JAMES DANIEL, JOHNSON...","[SNOOK SHARIKA, RIALL JAMES D, PUGH RANDALL B,...","[curve lens, meniscus, optical lens, electrica..."
4,8215770,US-8215770-B2,US,40337739,20070223,Ophthalmic dynamic aperture,Embodiments of the present invention relate to...,"1. An ophthalmic device, comprising:\n an elec...","[ophthalmic, dynamic, aperture, embodiments, e...",2007,2,23,"[A61F2/16, G02C7/04]","[A, G]","[A61, G02]","[A61F, G02C]","[A61F2, G02C7]","[US-6956682-B2, US-5608567-A, US-5963300-A, US...","[US-9535263-B2, US-9201250-B2, US-10203522-B2,...","[HADDOCK JOSHUA N, VAN HEUGTEN ANTHONY, KOKONA...","[VAN HEUGTEN ANTHONY, HUNKELER JOHN, E A OPHTH...","[electro active, electro, aperture, ophthalmic..."


## Computing hierarchies with different levels

### Helper functions

In [5]:
def unite_arrays(series):
    l = [item for sublist in series.values for item in sublist]
    myset = set(l)
    l = list(myset)
    return l

In [6]:
def get_unique_values(df, col, t, starts_with = ''):    
    
    if t == 'list' or t == 'code':
        values = [item for sublist in df[col].values for item in sublist]
        values_set = set(values)
        values = list(values_set)
        values_filtered = [item for item in values if item.startswith(starts_with)]
    elif t == 'value':
        values_filtered = pd.unique(df[col])    
    else:
        raise ValueError('Allowed types: list, value, code')
        
    return values_filtered

In [7]:
def get_subgroup(df, col, t, v):    
    
    if t == 'list' or t == 'code':
        subgroup = df.loc[df[col].apply(lambda x: v in x)] 
    elif t == 'value':
        subgroup = df.loc[df[col] == v] 
    else:
        raise ValueError('Allowed types: list, value, code')
        
    return subgroup

In [8]:
def create_hierarchy(df, columns, types):
    if (len(columns) != len(types)):
        raise ValueError('You must specify a type for each column')
       
    print("Creating hierarchy with columns: " + ', '.join(columns) + ' and types: ' + ', '.join(types) + '\n')
    
    hierarchy = pd.DataFrame(columns = ["id", "name", "ids", "parent", 'column'])
        
    parents = {"root": ['0_root']}
    parent_type = 'value'
    parent_col = 'unused'
    
    level_stack = []
    filtered_stack = []    
    unique_values = []
    
    print('Forward pass. Grouping\n')
    
    for i in range(len(columns)):
        t = types[i]
        col = columns[i]
        
        unique_values.append({})
        
        print(str(i) + ': Splitting by ' + t + ' attribute ' + col + '\n')   
        
        level = pd.DataFrame(columns=hierarchy.columns)
        filtered_stack_level = {}
        
        #print("Parents:\n")
        #print(parents)    
            
        for parent_key, parent_ids in parents.items():
            for parent_id in parent_ids:
                if parent_key == 'root':
                    group = df
                else:
                    group = filtered_stack[i-1][parent_id] # get_by_parent(df, parent_col, p, parent_type)                          

                #print("Grouping within parent: " + parent_id + " in parent column " + parent_col + " with type: " + parent_type)

                if t == 'code' and parent_type == 'code':
                    values = get_unique_values(group, col, t, parent_key)
                else:
                    values = get_unique_values(group, col, t)

                #print('Unique values: '+ str(len(values)) + '. Some values: ' + ', '.join(values[:10]) + '\n')

                subgroup_values = {}

                for v in values:
                    if v in unique_values[i]:
                        modified_value = str(len(unique_values[i][v])) + "_" + v                   
                        unique_values[i][v].append(modified_value)
                    else:
                        modified_value = "0_" + v
                        unique_values[i][v] = [modified_value]
                    subgroup_values[v] = modified_value

                #print("\nSubgroup values:\n")
                #print(subgroup_values)

                for key, value in subgroup_values.items(): #there should be only one value per key
                                        
                    subgroup = get_subgroup(group, col, t, key)
                    filtered_stack_level[value] = subgroup

                    ids = subgroup["publication_number"].values;
                    level = level.append(pd.DataFrame([[value, key, ids, parent_id, col]], columns = hierarchy.columns))
                
        parents = unique_values[i]
        parent_type = t
        parent_col = col
        
        #print("\nUnique values level " + str(i) + ":\n")
        #print(unique_values[i])

        #print("Filtered stack level:\n")
        #print(filtered_stack_level)
        
        level_stack.append(level)
        filtered_stack.append(filtered_stack_level)
        
        
    print('Backward pass. Aggregating the subgroup counts\n')
    
    for i in reversed(range(len(columns))):
        
        #print('i: '+ str(i))
        
        level = level_stack[i]
        
        #print('Current level:')
        #print(level.head())
        
        level_grouped = level.groupby(['parent']).agg({"ids": unite_arrays})
                
        for index, row in level_grouped.iterrows():
            parent_name = row.name
            ids = row["ids"]

            if i == 0:
                print("parent of " + parent_name + " is null")
                
                d = {"id": ["0_root"], "name": ["root"], "ids": [ids], "parent": [""], 'column':[i]}
                previous_level = pd.DataFrame(data = d)
                print("Appended root node: ")
                print(previous_level)
                
                hierarchy = hierarchy.append(previous_level)
                                        
        hierarchy = hierarchy.append(level)
               
    return hierarchy

### Dividing by assignee

In [9]:
hierarchy = create_hierarchy(df, ['assignees'], ['list'])

Creating hierarchy with columns: assignees and types: list

Forward pass. Grouping

0: Splitting by list attribute assignees

Backward pass. Aggregating the subgroup counts

parent of 0_root is null
Appended root node: 
       id  name                                                ids parent  \
0  0_root  root  [US-7040756-B2, US-2007091259-A1, US-7111938-B...          

   column  
0       0  


In [10]:
print(hierarchy.head())
hierarchy.tail()

                      id                 name  \
0                 0_root                 root   
0     0_SMITH EARL L III     SMITH EARL L III   
0  0_CAMPANELLI GIOVANNI  CAMPANELLI GIOVANNI   
0     0_SCHMIEDER ROLAND     SCHMIEDER ROLAND   
0             0_JRS CORP             JRS CORP   

                                                 ids  parent     column  
0  [US-7040756-B2, US-2007091259-A1, US-7111938-B...                  0  
0  [US-8992010-B2, US-2007296916-A1, US-201232033...  0_root  assignees  
0                                 [US-2011147958-A1]  0_root  assignees  
0  [US-2006077341-A1, US-2012026459-A1, US-903917...  0_root  assignees  
0                                 [US-2017058237-A1]  0_root  assignees  


Unnamed: 0,id,name,ids,parent,column
0,0_UNIV TEXAS,UNIV TEXAS,[US-6329024-B1],0_root,assignees
0,0_SCHAEFER ROLF,SCHAEFER ROLF,[US-2009232871-A1],0_root,assignees
0,0_EZRIELEV ROBERT I,EZRIELEV ROBERT I,[US-4171878-A],0_root,assignees
0,0_PRUITT JOHN DALLAS,PRUITT JOHN DALLAS,"[US-8480227-B2, US-2012029111-A1, US-200905916...",0_root,assignees
0,0_TA CHRISTOPHER,TA CHRISTOPHER,"[US-8322852-B2, US-2009020683-A1, US-201116624...",0_root,assignees


In [11]:
hierarchy.to_json(src_dir + 'hierarchy/' + seed_name + "_assignee.json", orient="records")

### Dividing by country

In [12]:
hierarchy = create_hierarchy(df, ['country_code'], ['value'])

Creating hierarchy with columns: country_code and types: value

Forward pass. Grouping

0: Splitting by value attribute country_code

Backward pass. Aggregating the subgroup counts

parent of 0_root is null
Appended root node: 
       id  name                                                ids parent  \
0  0_root  root  [US-7040756-B2, US-2007091259-A1, US-7111938-B...          

   column  
0       0  


In [13]:
print(hierarchy.head())
hierarchy.tail()

       id  name                                                ids  parent  \
0  0_root  root  [US-7040756-B2, US-2007091259-A1, US-7111938-B...           
0    0_US    US  [US-4104187-A, US-6000534-A, US-7281794-B2, US...  0_root   

         column  
0             0  
0  country_code  


Unnamed: 0,id,name,ids,parent,column
0,0_root,root,"[US-7040756-B2, US-2007091259-A1, US-7111938-B...",,0
0,0_US,US,"[US-4104187-A, US-6000534-A, US-7281794-B2, US...",0_root,country_code


In [14]:
hierarchy.to_json(src_dir + 'hierarchy/' + seed_name + "_country.json", orient="records")

### Dividing by all levels of IPC hierarchy: A, A21, A21B, A21B 11/04

In [15]:
hierarchy = create_hierarchy(df, ['first_letters', 'three_letters', 'four_letters', 'before_slash', 'ipc_classes'], ['code', 'code', 'code', 'code','code'])

Creating hierarchy with columns: first_letters, three_letters, four_letters, before_slash, ipc_classes and types: code, code, code, code, code

Forward pass. Grouping

0: Splitting by code attribute first_letters

1: Splitting by code attribute three_letters

2: Splitting by code attribute four_letters

3: Splitting by code attribute before_slash

4: Splitting by code attribute ipc_classes

Backward pass. Aggregating the subgroup counts

parent of 0_root is null
Appended root node: 
       id  name                                                ids parent  \
0  0_root  root  [US-7040756-B2, US-2007091259-A1, US-7111938-B...          

   column  
0       0  


In [16]:
print(hierarchy.head())
hierarchy.tail()

             id        name  \
0  0_D06M15/643  D06M15/643   
0    0_D06P3/58    D06P3/58   
0    0_D06P3/34    D06P3/34   
0    0_D06P3/00    D06P3/00   
0    0_D06P5/02    D06P5/02   

                                                 ids    parent       column  
0                                    [US-9459378-B2]  0_D06M15  ipc_classes  
0                                     [US-6149692-A]   0_D06P3  ipc_classes  
0  [US-5292350-A, US-5534038-A, US-5151106-A, US-...   0_D06P3  ipc_classes  
0  [US-4981487-A, US-5244470-A, US-4494954-A, US-...   0_D06P3  ipc_classes  
0               [US-2013083286-A1, US-2013083287-A1]   0_D06P5  ipc_classes  


Unnamed: 0,id,name,ids,parent,column
0,0_B,B,"[US-10194724-B2, US-2005258408-A1, US-5942558-...",0_root,first_letters
0,0_A,A,"[US-4104187-A, US-6000534-A, US-7281794-B2, US...",0_root,first_letters
0,0_E,E,"[US-2012145942-A1, US-8329763-B2, US-4609493-A]",0_root,first_letters
0,0_F,F,"[US-4707343-A, US-5340399-A, US-5776999-A, US-...",0_root,first_letters
0,0_C,C,"[US-4104187-A, US-5942558-A, US-2009096985-A1,...",0_root,first_letters


In [17]:
hierarchy.to_json(src_dir + 'hierarchy/' + seed_name + "_ipc_classes.json", orient="records")

### Dividing by assignee, then by country

In [18]:
hierarchy = create_hierarchy(df, ['assignees', 'country_code'], ['list', 'value'])

Creating hierarchy with columns: assignees, country_code and types: list, value

Forward pass. Grouping

0: Splitting by list attribute assignees

1: Splitting by value attribute country_code

Backward pass. Aggregating the subgroup counts

parent of 0_root is null
Appended root node: 
       id  name                                                ids parent  \
0  0_root  root  [US-7040756-B2, US-2007091259-A1, US-7111938-B...          

   column  
0       0  


In [19]:
print(hierarchy.head())
hierarchy.tail()

     id name                                                ids  \
0  0_US   US  [US-8992010-B2, US-2007296916-A1, US-201232033...   
0  1_US   US                                 [US-2011147958-A1]   
0  2_US   US  [US-2006077341-A1, US-2012026459-A1, US-903917...   
0  3_US   US                                 [US-2017058237-A1]   
0  4_US   US                                     [US-3933411-A]   

                  parent        column  
0     0_SMITH EARL L III  country_code  
0  0_CAMPANELLI GIOVANNI  country_code  
0     0_SCHMIEDER ROLAND  country_code  
0             0_JRS CORP  country_code  
0      0_WINNER ALBERT E  country_code  


Unnamed: 0,id,name,ids,parent,column
0,0_UNIV TEXAS,UNIV TEXAS,[US-6329024-B1],0_root,assignees
0,0_SCHAEFER ROLF,SCHAEFER ROLF,[US-2009232871-A1],0_root,assignees
0,0_EZRIELEV ROBERT I,EZRIELEV ROBERT I,[US-4171878-A],0_root,assignees
0,0_PRUITT JOHN DALLAS,PRUITT JOHN DALLAS,"[US-8480227-B2, US-2012029111-A1, US-200905916...",0_root,assignees
0,0_TA CHRISTOPHER,TA CHRISTOPHER,"[US-8322852-B2, US-2009020683-A1, US-201116624...",0_root,assignees


In [20]:
hierarchy.to_json(src_dir + 'hierarchy/' + seed_name + "_assignee_country.json", orient="records")

### Dividing by assignee and then by upper levels of IPC hierarchy: A, A21, A21N 

In [21]:
hierarchy = create_hierarchy(df, ['assignees', 'first_letters', 'three_letters', 'four_letters' ], ['list', 'code', 'code', 'code'])

Creating hierarchy with columns: assignees, first_letters, three_letters, four_letters and types: list, code, code, code

Forward pass. Grouping

0: Splitting by list attribute assignees

1: Splitting by code attribute first_letters

2: Splitting by code attribute three_letters

3: Splitting by code attribute four_letters

Backward pass. Aggregating the subgroup counts

parent of 0_root is null
Appended root node: 
       id  name                                                ids parent  \
0  0_root  root  [US-7040756-B2, US-2007091259-A1, US-7111938-B...          

   column  
0       0  


In [22]:
print(hierarchy.head())
hierarchy.tail()

       id  name                                                ids parent  \
0  0_G02C  G02C  [US-8992010-B2, US-2007296916-A1, US-201232033...  0_G02   
0  0_G02B  G02B                                 [US-2011147958-A1]  1_G02   
0  1_G02C  G02C  [US-2006077341-A1, US-2012026459-A1, US-903917...  2_G02   
0  1_G02B  G02B                                    [US-9039173-B2]  2_G02   
0  2_G02B  G02B                                 [US-2017058237-A1]  3_G02   

         column  
0  four_letters  
0  four_letters  
0  four_letters  
0  four_letters  
0  four_letters  


Unnamed: 0,id,name,ids,parent,column
0,0_UNIV TEXAS,UNIV TEXAS,[US-6329024-B1],0_root,assignees
0,0_SCHAEFER ROLF,SCHAEFER ROLF,[US-2009232871-A1],0_root,assignees
0,0_EZRIELEV ROBERT I,EZRIELEV ROBERT I,[US-4171878-A],0_root,assignees
0,0_PRUITT JOHN DALLAS,PRUITT JOHN DALLAS,"[US-8480227-B2, US-2012029111-A1, US-200905916...",0_root,assignees
0,0_TA CHRISTOPHER,TA CHRISTOPHER,"[US-8322852-B2, US-2009020683-A1, US-201116624...",0_root,assignees


In [23]:
hierarchy.to_json(src_dir + 'hierarchy/' + seed_name + "_assignee_ipc_classes.json", orient="records")

### Dividing by country, then by assignee

In [24]:
hierarchy = create_hierarchy(df, ['country_code', 'assignees'], ['value', 'list'])

Creating hierarchy with columns: country_code, assignees and types: value, list

Forward pass. Grouping

0: Splitting by value attribute country_code

1: Splitting by list attribute assignees

Backward pass. Aggregating the subgroup counts

parent of 0_root is null
Appended root node: 
       id  name                                                ids parent  \
0  0_root  root  [US-7040756-B2, US-2007091259-A1, US-7111938-B...          

   column  
0       0  


In [25]:
print(hierarchy.head())
hierarchy.tail()

                      id                 name  \
0     0_SMITH EARL L III     SMITH EARL L III   
0  0_CAMPANELLI GIOVANNI  CAMPANELLI GIOVANNI   
0     0_SCHMIEDER ROLAND     SCHMIEDER ROLAND   
0             0_JRS CORP             JRS CORP   
0      0_WINNER ALBERT E      WINNER ALBERT E   

                                                 ids parent     column  
0  [US-8992010-B2, US-2007296916-A1, US-201232033...   0_US  assignees  
0                                 [US-2011147958-A1]   0_US  assignees  
0  [US-2006077341-A1, US-2012026459-A1, US-903917...   0_US  assignees  
0                                 [US-2017058237-A1]   0_US  assignees  
0                                     [US-3933411-A]   0_US  assignees  


Unnamed: 0,id,name,ids,parent,column
0,0_EZRIELEV ROBERT I,EZRIELEV ROBERT I,[US-4171878-A],0_US,assignees
0,0_PRUITT JOHN DALLAS,PRUITT JOHN DALLAS,"[US-8480227-B2, US-2012029111-A1, US-200905916...",0_US,assignees
0,0_TA CHRISTOPHER,TA CHRISTOPHER,"[US-8322852-B2, US-2009020683-A1, US-201116624...",0_US,assignees
0,0_root,root,"[US-7040756-B2, US-2007091259-A1, US-7111938-B...",,0
0,0_US,US,"[US-4104187-A, US-6000534-A, US-7281794-B2, US...",0_root,country_code


In [26]:
hierarchy.to_json(src_dir + 'hierarchy/' + seed_name + "_country_assignee.json", orient="records")

### Dividing by country, then by all levels of IPC hierarchy: A, A21, A21B, A21B 11/04

In [27]:
hierarchy = create_hierarchy(df, ['country_code', 'first_letters', 'three_letters', 'four_letters', 'before_slash', 'ipc_classes'], 
                                 ['value', 'code', 'code', 'code', 'code','code'])

Creating hierarchy with columns: country_code, first_letters, three_letters, four_letters, before_slash, ipc_classes and types: value, code, code, code, code, code

Forward pass. Grouping

0: Splitting by value attribute country_code

1: Splitting by code attribute first_letters

2: Splitting by code attribute three_letters

3: Splitting by code attribute four_letters

4: Splitting by code attribute before_slash

5: Splitting by code attribute ipc_classes

Backward pass. Aggregating the subgroup counts

parent of 0_root is null
Appended root node: 
       id  name                                                ids parent  \
0  0_root  root  [US-7040756-B2, US-2007091259-A1, US-7111938-B...          

   column  
0       0  


In [28]:
print(hierarchy.head())
hierarchy.tail()

             id        name  \
0  0_D06M15/643  D06M15/643   
0    0_D06P3/58    D06P3/58   
0    0_D06P3/34    D06P3/34   
0    0_D06P3/00    D06P3/00   
0    0_D06P5/02    D06P5/02   

                                                 ids    parent       column  
0                                    [US-9459378-B2]  0_D06M15  ipc_classes  
0                                     [US-6149692-A]   0_D06P3  ipc_classes  
0  [US-5292350-A, US-5534038-A, US-5151106-A, US-...   0_D06P3  ipc_classes  
0  [US-4981487-A, US-5244470-A, US-4494954-A, US-...   0_D06P3  ipc_classes  
0               [US-2013083286-A1, US-2013083287-A1]   0_D06P5  ipc_classes  


Unnamed: 0,id,name,ids,parent,column
0,0_E,E,"[US-2012145942-A1, US-8329763-B2, US-4609493-A]",0_US,first_letters
0,0_F,F,"[US-4707343-A, US-5340399-A, US-5776999-A, US-...",0_US,first_letters
0,0_C,C,"[US-4104187-A, US-5942558-A, US-2009096985-A1,...",0_US,first_letters
0,0_root,root,"[US-7040756-B2, US-2007091259-A1, US-7111938-B...",,0
0,0_US,US,"[US-4104187-A, US-6000534-A, US-7281794-B2, US...",0_root,country_code


In [29]:
hierarchy.to_json(src_dir + 'hierarchy/' + seed_name + "_country_ipc_classes.json", orient="records")

### Dividing by upper levels of IPC hierarchy: A, A21, A21B, A21B 11 and then by assignee

In [30]:
hierarchy = create_hierarchy(df, ['first_letters', 'three_letters', 'four_letters', 'before_slash', 'assignees'], ['code', 'code', 'code', 'code', 'list'])

Creating hierarchy with columns: first_letters, three_letters, four_letters, before_slash, assignees and types: code, code, code, code, list

Forward pass. Grouping

0: Splitting by code attribute first_letters

1: Splitting by code attribute three_letters

2: Splitting by code attribute four_letters

3: Splitting by code attribute before_slash

4: Splitting by list attribute assignees

Backward pass. Aggregating the subgroup counts

parent of 0_root is null
Appended root node: 
       id  name                                                ids parent  \
0  0_root  root  [US-7040756-B2, US-2007091259-A1, US-7111938-B...          

   column  
0       0  


In [31]:
print(hierarchy.head())
hierarchy.tail()

                                id                           name  \
0          0_UMAPATHY SENTHILKUMAR          UMAPATHY SENTHILKUMAR   
0  0_MOMENTIVE PERFORMANCE MAT INC  MOMENTIVE PERFORMANCE MAT INC   
0                 0_SAXENA ANUBHAV                 SAXENA ANUBHAV   
0                 0_BHAT SHREEDHAR                 BHAT SHREEDHAR   
0                0_LEWIS KENRICK M                LEWIS KENRICK M   

               ids    parent     column  
0  [US-9459378-B2]  0_D06M15  assignees  
0  [US-9459378-B2]  0_D06M15  assignees  
0  [US-9459378-B2]  0_D06M15  assignees  
0  [US-9459378-B2]  0_D06M15  assignees  
0  [US-9459378-B2]  0_D06M15  assignees  


Unnamed: 0,id,name,ids,parent,column
0,0_B,B,"[US-10194724-B2, US-2005258408-A1, US-5942558-...",0_root,first_letters
0,0_A,A,"[US-4104187-A, US-6000534-A, US-7281794-B2, US...",0_root,first_letters
0,0_E,E,"[US-2012145942-A1, US-8329763-B2, US-4609493-A]",0_root,first_letters
0,0_F,F,"[US-4707343-A, US-5340399-A, US-5776999-A, US-...",0_root,first_letters
0,0_C,C,"[US-4104187-A, US-5942558-A, US-2009096985-A1,...",0_root,first_letters


In [32]:
hierarchy.to_json(src_dir + 'hierarchy/' + seed_name + "_ipc_classes_assignee.json", orient="records")

### Dividing by upper levels of IPC hierarchy: A, A21, A21B, A21B 11 and then by country

In [33]:
hierarchy = create_hierarchy(df, ['first_letters', 'three_letters', 'four_letters', 'before_slash', 'country_code'], ['code', 'code', 'code', 'code', 'value'])

Creating hierarchy with columns: first_letters, three_letters, four_letters, before_slash, country_code and types: code, code, code, code, value

Forward pass. Grouping

0: Splitting by code attribute first_letters

1: Splitting by code attribute three_letters

2: Splitting by code attribute four_letters

3: Splitting by code attribute before_slash

4: Splitting by value attribute country_code

Backward pass. Aggregating the subgroup counts

parent of 0_root is null
Appended root node: 
       id  name                                                ids parent  \
0  0_root  root  [US-7040756-B2, US-2007091259-A1, US-7111938-B...          

   column  
0       0  


In [34]:
print(hierarchy.head())
hierarchy.tail()

     id name                                                ids    parent  \
0  0_US   US                                    [US-9459378-B2]  0_D06M15   
0  1_US   US  [US-4981487-A, US-5292350-A, US-5244470-A, US-...   0_D06P3   
0  2_US   US  [US-7216975-B2, US-2013083286-A1, US-7048375-B...   0_D06P5   
0  3_US   US  [US-7216975-B2, US-5292350-A, US-5244470-A, US...   0_D06P1   
0  4_US   US                  [US-9459378-B2, US-2015011661-A1]  0_D21H17   

         column  
0  country_code  
0  country_code  
0  country_code  
0  country_code  
0  country_code  


Unnamed: 0,id,name,ids,parent,column
0,0_B,B,"[US-10194724-B2, US-2005258408-A1, US-5942558-...",0_root,first_letters
0,0_A,A,"[US-4104187-A, US-6000534-A, US-7281794-B2, US...",0_root,first_letters
0,0_E,E,"[US-2012145942-A1, US-8329763-B2, US-4609493-A]",0_root,first_letters
0,0_F,F,"[US-4707343-A, US-5340399-A, US-5776999-A, US-...",0_root,first_letters
0,0_C,C,"[US-4104187-A, US-5942558-A, US-2009096985-A1,...",0_root,first_letters


In [35]:
hierarchy.to_json(src_dir + 'hierarchy/' + seed_name + "_ipc_classes_country.json", orient="records")