import math
import os

import dask.dataframe as dd
import gravis as gv  # for visualization of the KG schema and subgraphs, developed by the author of this notebook
import igraph as ig

import shared_bmkg

project_name = "primekg"
download_dir = os.path.join(project_name, "downloads")
results_dir = os.path.join(project_name, "results")

shared_bmkg.create_dir(download_dir)
shared_bmkg.create_dir(results_dir)

download_specification = [
    ("kg.csv", "https://dataverse.harvard.edu/api/access/datafile/6180620", "aac8191d4fbc5bf09cdf8c3c78b4e75f"),
    ("disease_features.tab", "https://dataverse.harvard.edu/api/access/datafile/6180618", "f8d120497eb69848dc7d971ae30e3cd6"),
    ("drug_features.tab", "https://dataverse.harvard.edu/api/access/datafile/6180619", "e8c67d20e815b0d26d9d91be79adfff8"),

    ("nodes.tab", "https://dataverse.harvard.edu/api/access/datafile/6180617", "4924de04fb3deefa1e0a8dada424538e"),  # MD5 differs on website
    ("edges.csv", "https://dataverse.harvard.edu/api/access/datafile/6180616", "5d4d211a22e88544b78fde2735e797bc"),

    ("README.txt", "https://dataverse.harvard.edu/api/access/datafile/6191270", "608e37d4808bb97643186a1b6dc8f307"),
]

for filename, url, md5 in download_specification:
    filepath = os.path.join(download_dir, filename)
    shared_bmkg.fetch_file(url, filepath)
    shared_bmkg.validate_file(filepath, md5)
    print()

Found a full local copy of "primekg/downloads/kg.csv".
MD5 checksum is correct.

Found a full local copy of "primekg/downloads/disease_features.tab".
MD5 checksum is correct.

Found a full local copy of "primekg/downloads/drug_features.tab".
MD5 checksum is correct.

Found a full local copy of "primekg/downloads/nodes.tab".
MD5 checksum is correct.

Found a full local copy of "primekg/downloads/edges.csv".
MD5 checksum is correct.

Found a full local copy of "primekg/downloads/README.txt".
MD5 checksum is correct.

%%time

df_kg = shared_bmkg.read_csv_file(os.path.join(download_dir, "kg.csv"))
df_drug_node_annotations = shared_bmkg.read_tsv_file(os.path.join(download_dir, "drug_features.tab"))
df_disease_node_annotations = shared_bmkg.read_tsv_file(os.path.join(download_dir, "disease_features.tab"))

df_nodes = shared_bmkg.read_tsv_file(os.path.join(download_dir, "nodes.tab"))
df_edges = shared_bmkg.read_csv_file(os.path.join(download_dir, "edges.csv"))

CPU times: user 50.7 s, sys: 14.7 s, total: 1min 5s
Wall time: 35.6 s

unique_nodes = list(set(df_kg['x_index'].unique().tolist() + df_kg['y_index'].unique().tolist()))
num_nodes = len(unique_nodes)
num_edges = len(df_kg)

print(f"{num_nodes:,} nodes")
print(f"{num_edges:,} edges")

129,375 nodes
8,100,498 edges

%%time

# Ensure each node is considered exactly once
node_to_type = {}
for row in df_kg.itertuples():
    node_to_type[row.x_index] = row.x_type
    node_to_type[row.y_index] = row.y_type

# Count types of the unique nodes
nt_counts = {}
for nt in node_to_type.values():
    if nt not in nt_counts:
        nt_counts[nt] = 0
    nt_counts[nt] += 1

print(len(nt_counts), "node types, sorted by their frequency of occurrence:")
for type, cnt in sorted(nt_counts.items(), key=lambda item: -item[1]):
    print(f"- {type}: {cnt}")
print()

10 node types, sorted by their frequency of occurrence:
- biological_process: 28642
- gene/protein: 27671
- disease: 17080
- effect/phenotype: 15311
- anatomy: 14035
- molecular_function: 11169
- drug: 7957
- cellular_component: 4176
- pathway: 2516
- exposure: 818

CPU times: user 1min 8s, sys: 209 ms, total: 1min 8s
Wall time: 1min 9s

%%time

et_column = "relation"
et_counts = df_kg.groupby(et_column).size()
et_counts = dict(et_counts.sort_values(ascending=False))

print(len(et_counts), "edge types, sorted by their frequency:")
for key, val in et_counts.items():
    print(f"- {key}: {val}")
print()

30 edge types, sorted by their frequency:
- anatomy_protein_present: 3036406
- drug_drug: 2672628
- protein_protein: 642150
- disease_phenotype_positive: 300634
- bioprocess_protein: 289610
- cellcomp_protein: 166804
- disease_protein: 160822
- molfunc_protein: 139060
- drug_effect: 129568
- bioprocess_bioprocess: 105772
- pathway_protein: 85292
- disease_disease: 64388
- contraindication: 61350
- drug_protein: 51306
- anatomy_protein_absent: 39774
- phenotype_phenotype: 37472
- anatomy_anatomy: 28064
- molfunc_molfunc: 27148
- indication: 18776
- cellcomp_cellcomp: 9690
- phenotype_protein: 6660
- off-label use: 5136
- pathway_pathway: 5070
- exposure_disease: 4608
- exposure_exposure: 4140
- exposure_bioprocess: 3250
- exposure_protein: 2424
- disease_phenotype_negative: 2386
- exposure_molfunc: 90
- exposure_cellcomp: 20

CPU times: user 2.39 s, sys: 389 ms, total: 2.78 s
Wall time: 2.84 s

# Correctness checks

# 1) Do the counts of different node types add up to the total number of nodes?
sum_node_types = sum(nt_counts.values())
assert sum_node_types == num_nodes, f"Node counts differ: {sum_node_types} != {num_nodes}"
print(f"{sum_node_types:,} = {num_nodes:,} nodes")

# 2) Do the counts of different edge types add up to the total number of edges?
sum_edge_types = sum(et_counts.values())
assert sum_edge_types == num_edges, f"Edge counts differ: {sum_edge_types} != {num_edges}"
print(f"{sum_edge_types:,} = {num_edges:,} edges")

129,375 = 129,375 nodes
8,100,498 = 8,100,498 edges

def report_first_n_items(data, n):
    return data.head(n)

def report_last_n_items(data, n):
    return data.tail(n)

report_first_n_items(df_kg, 2)

report_last_n_items(df_kg, 2)

report_first_n_items(df_drug_node_annotations, 2)

report_first_n_items(df_disease_node_annotations, 2)

df_nodes

df_edges

node_type_to_color = {
    "drug": "green",

    "gene/protein": "blue",

    "disease": "red",
    "pathway": "red",
    "biological_process": "red",
}

%%time

unique_triples = set()
for row in df_kg.itertuples():
    s = row.x_type
    p = row.relation
    o = row.y_type
    triple = (s, p, o)
    unique_triples.add(triple)

CPU times: user 1min 2s, sys: 63.6 ms, total: 1min 2s
Wall time: 1min 2s

gs = ig.Graph(directed=True)
unique_nodes = set()
for s, p, o in unique_triples:
    for node in (s, o):
        if node not in unique_nodes:
            unique_nodes.add(node)
            
            node_size = int(nt_counts[node])
            node_color = node_type_to_color.get(node, '')
            node_hover = f"{node}\n\n{nt_counts[node]} nodes of this type are contained in the knowledge graph."
            gs.add_vertex(node, size=node_size, color=node_color, label_color=node_color, hover=node_hover)

    edge_size = int(et_counts[p])
    edge_color = node_type_to_color.get(s, '')
    edge_hover = f"{p}\n\n{et_counts[p]} edges of this type are contained in the knowledge graph."
    gs.add_edge(s, o, size=edge_size, color=edge_color, hover=edge_hover, label=p, label_color="gray", label_size=5)

gs.vcount(), gs.ecount()

(10, 50)

fig = gv.d3(
    gs,
    show_node_label=True,
    node_label_data_source="name",

    show_edge_label=True,
    edge_label_data_source="label",
    edge_curvature=0.2,

    use_node_size_normalization=True,
    node_size_normalization_min=10,
    node_size_normalization_max=50,
    node_drag_fix=True,
    node_hover_neighborhood=True,
    
    use_edge_size_normalization=True,
    edge_size_normalization_max=3,

    many_body_force_strength=-3000,
    zoom_factor=0.8,
)
fig

# Export the schema visualization
schema_filepath = os.path.join(results_dir, f"{project_name}_schema.html")
fig.export_html(schema_filepath, overwrite=True)

def is_nan(val):
    if isinstance(val, float):
        return math.isnan(val)
    return False
    
dataframes = [
    df_drug_node_annotations,
    df_disease_node_annotations,
]

node_id_to_annotation_map = {}
for df in dataframes:
    data_columns = list(df.columns)[1:]
    for row in df.itertuples():
        # indexing: 0=id (ignored), 1=name, 2-n=annotations
        node_id = row[1]
        annotation = {col: row[i] for i, col in enumerate(data_columns, 2)}
        annotation = {k:v for k, v in annotation.items()
                      if v is not None
                      and v != ""
                      and not is_nan(v)}
        node_id_to_annotation_map[node_id] = annotation

%%time

nodes = []
seen_node_ids = set()
# x
for row in df_kg.itertuples():
    node_id = row.x_index
    if node_id not in seen_node_ids:
        seen_node_ids.add(node_id)
        node_type = row.x_type
        node_properties = {"identifier": row.x_id, "label": row.x_name, "source": row.x_source}
        if node_id in node_id_to_annotation_map:
            node_properties.update(node_id_to_annotation_map[node_id])
        node = (node_id, node_type, node_properties)
        nodes.append(node)
# y
for row in df_kg.itertuples():
    node_id = row.y_index
    if node_id not in seen_node_ids:
        seen_node_ids.add(node_id)
        node_type = row.y_type
        node_properties = {"identifier": row.y_id, "label": row.y_name, "source": row.y_source}
        if node_id in node_id_to_annotation_map:
            node_properties.update(node_id_to_annotation_map[node_id])
        node = (node_id, node_type, node_properties)
        nodes.append(node)

CPU times: user 1min 44s, sys: 97.4 ms, total: 1min 44s
Wall time: 1min 44s

%%time

edges = []
for row in df_kg.itertuples():
    source_id = row.x_index
    target_id = row.y_index
    edge_type = row.relation
    edge_properties = {"display_relation": row.display_relation}
    edge = (source_id, target_id, edge_type, edge_properties)
    edges.append(edge)

CPU times: user 1min 28s, sys: 5.06 s, total: 1min 33s
Wall time: 1min 34s

nodes_csv_filepath = shared_bmkg.export_nodes_as_csv(nodes, results_dir, project_name)

edges_csv_filepath = shared_bmkg.export_edges_as_csv(edges, results_dir, project_name)

%%time

g = shared_bmkg.create_graph(nodes, edges)

CPU times: user 29.1 s, sys: 3.43 s, total: 32.5 s
Wall time: 32.5 s

shared_bmkg.report_graph_stats(g)

Directed multigraph with 129375 nodes, 8100498 edges and a density of 0.000484.

# Correctness checks

# 1) Does the reconstructed graph contain the same number of nodes as the raw data?
num_nodes_in_graph = g.vcount()
assert num_nodes_in_graph == num_nodes, f"Node counts differ: {num_nodes_in_graph} != {num_nodes}"
print(f"{num_nodes_in_graph:,} = {num_nodes:,}")

# 2) Does the reconstructed graph contain the same number of (unique) edges as the raw data?
num_edges_in_graph = g.ecount()
assert num_edges_in_graph == num_edges, f"Edge counts differ: {num_edges_in_graph} != {num_unique_triples}"
print(f"{num_edges_in_graph:,} = {num_edges:,}")

129,375 = 129,375
8,100,498 = 8,100,498

%%time

g_graphml_filepath = shared_bmkg.export_graph_as_graphml(g, results_dir, project_name)

CPU times: user 1min 5s, sys: 5.9 s, total: 1min 11s
Wall time: 1min 10s

# Drug: Imatinib
shared_bmkg.list_nodes_matching_substring(g, "imatinib", "label")

id        type       label                               
=========================================================
129280    pathway    Imatinib-resistant KIT mutants      
129298    pathway    Imatinib-resistant PDGFR mutants    
14207     drug       Imatinib

# Gene: ABL1
shared_bmkg.list_nodes_matching_substring(g, "abl1", "label")

id       type            label                                                          
========================================================================================
30039    disease         chronic myelogenous leukemia, BCR-ABL1 positive                
601      gene/protein    ABL1                                                           
84261    disease         blast phase chronic myelogenous leukemia, BCR-ABL1 positive    
94836    disease         atypical chronic myeloid leukemia, BCR-ABL1 negative           
99701    disease         acute myeloid leukemia with BCR-ABL1

# Disease: Myeloid Leukemia - to find Chronic Myeloid Leukemia (CML)
shared_bmkg.list_nodes_matching_substring(g, "myeloid leukemia", "label")

id       type                label                                                                                                
==================================================================================================================================
23849    effect/phenotype    Myeloid leukemia                                                                                     
28520    disease             myeloid leukemia                                                                                     
37656    disease             bilineal acute myeloid leukemia                                                                      
37714    disease             unclassified acute myeloid leukemia                                                                  
38274    disease             inherited acute myeloid leukemia                                                                     
39007    disease             acute myeloid leukemia with recurrent genetic anomaly                                                
39124    disease             therapy related acute myeloid leukemia and myelodysplastic syndrome                                  
83852    disease             acute myeloid leukemia with minimal differentiation                                                  
83854    disease             acute myeloid leukemia and myelodysplastic syndromes related to topoisomerase type 2 inhibitor       
83855    disease             acute myeloid leukemia with t(8;21)(q22;q22) translocation                                           
83856    disease             acute myeloid leukemia with CEBPA somatic mutations                                                  
83857    disease             acute myeloid leukemia with t(8;16)(p11;p13) translocation                                           
83858    disease             acute myeloid leukemia with t(6;9)(p23;q34)                                                          
83859    disease             acute myeloid leukemia with t(9;11)(p22;q23)                                                         
83860    disease             acute myeloid leukemia with inv3(p21;q26.2) or t(3;3)(p21;q26.2)                                     
83861    disease             acute myeloid leukemia with NPM1 somatic mutations                                                   
84319    disease             acute myeloid leukemia with abnormal bone marrow eosinophils inv(16)(p13q22) or t(16;16)(p13;q22)    
94152    effect/phenotype    Acute myeloid leukemia                                                                               
94705    disease             megakaryoblastic acute myeloid leukemia with t(1;22)(p13;q13)                                        
94812    disease             childhood acute myeloid leukemia                                                                     
94835    disease             atypical chronic myeloid leukemia                                                                    
94836    disease             atypical chronic myeloid leukemia, BCR-ABL1 negative                                                 
95114    disease             acute myeloid leukemia with multilineage dysplasia                                                   
97918    disease             acute myeloid leukemia and myelodysplastic syndromes related to alkylating agent                     
98001    disease             acute myeloid leukemia and myelodysplastic syndromes related to radiation                            
98797    disease             acute myeloid leukemia with 11q23 abnormalities                                                      
99701    disease             acute myeloid leukemia with BCR-ABL1                                                                 
99930    disease             acute myeloid leukemia with mutated NPM1

# Neighborhood of drug Imatinib
source = 14207
subgraph = shared_bmkg.get_egocentric_subgraph(g, source)

# Export
filename = f"{project_name}_neighbors_imatinib"
shared_bmkg.export_graph_as_graphml(subgraph, results_dir, filename)
shared_bmkg.export_nodes_as_csv(nodes, results_dir, filename, subgraph)
shared_bmkg.export_edges_as_csv(edges, results_dir, filename, subgraph)

# Report
shared_bmkg.report_graph_stats(subgraph)

Directed multigraph with 1724 nodes, 1019430 edges and a density of 0.343.

# Neighborhood of gene ABL1
source = 601
subgraph = shared_bmkg.get_egocentric_subgraph(g, source)

# Export
filename = f"{project_name}_neighbors_abl1"
shared_bmkg.export_graph_as_graphml(subgraph, results_dir, filename)
shared_bmkg.export_nodes_as_csv(nodes, results_dir, filename, subgraph)
shared_bmkg.export_edges_as_csv(edges, results_dir, filename, subgraph)

# Report
shared_bmkg.report_graph_stats(subgraph)

Directed multigraph with 754 nodes, 99842 edges and a density of 0.1756.

# Neighborhood of disease CML
source = 30039
subgraph = shared_bmkg.get_egocentric_subgraph(g, source)

# Export
filename = f"{project_name}_neighbors_cml"
shared_bmkg.export_graph_as_graphml(subgraph, results_dir, filename)
shared_bmkg.export_nodes_as_csv(nodes, results_dir, filename, subgraph)
shared_bmkg.export_edges_as_csv(edges, results_dir, filename, subgraph)

# Report
shared_bmkg.report_graph_stats(subgraph)
shared_bmkg.visualize_graph(subgraph, node_type_to_color, source)

Directed multigraph with 60 nodes, 338 edges and a density of 0.09389.

# Paths from drug Imatinib to disease AML
source = 14207
target = 99701
subgraph = shared_bmkg.get_paths_subgraph(g, source, target)

# Report
shared_bmkg.report_graph_stats(subgraph)
shared_bmkg.visualize_graph(subgraph, node_type_to_color, source, target)

Directed multigraph with 56 nodes, 259 edges and a density of 0.08259.

	relation	display_relation	x_index	x_id	x_type	x_name	x_source	y_index	y_id	y_type	y_name	y_source
8100496	anatomy_protein_absent	expression absent	64523	2084	anatomy	heart left ventricle	UBERON	58254	105378952	gene/protein	KLF18	NCBI
8100497	anatomy_protein_absent	expression absent	67302	5384	anatomy	nasal cavity epithelium	UBERON	58254	105378952	gene/protein	KLF18	NCBI

	node_index	description	half_life	indication	mechanism_of_action	protein_binding	pharmacodynamics	state	atc_1	atc_2	atc_3	atc_4	category	group	pathway	molecular_weight	tpsa	clogp
0	14012	Copper is a transition metal and a trace eleme...	NaN	For use in the supplementation of total parent...	Copper is absorbed from the gut via high affin...	Copper is nearly entirely bound by ceruloplasm...	Copper is incorporated into many enzymes throu...	Copper is a solid.	NaN	NaN	NaN	NaN	Copper is part of Copper-containing Intrauteri...	Copper is approved and investigational.	NaN	NaN	NaN	NaN
1	14013	Oxygen is an element displayed by the symbol O...	The half-life is approximately 122.24 seconds	Oxygen therapy in clinical settings is used ac...	Oxygen therapy increases the arterial pressure...	Oxygen binds to oxygen-carrying protein in red...	Oxygen therapy improves effective cellular oxy...	Oxygen is a gas.	Oxygen is anatomically related to various.	Oxygen is in the therapeutic group of all othe...	Oxygen is pharmacologically related to all oth...	The chemical and functional group of is medic...	Oxygen is part of Chalcogens ; Elements ; Gase...	Oxygen is approved and vet_approved.	NaN	The molecular weight is 32.0.	Oxygen has a topological polar surface area of...	NaN

	node_index	mondo_id	mondo_name	group_id_bert	group_name_bert	mondo_definition	umls_description	orphanet_definition	orphanet_prevalence	orphanet_epidemiology	orphanet_clinical_description	orphanet_management_and_treatment	mayo_symptoms	mayo_causes	mayo_risk_factors	mayo_complications	mayo_prevention	mayo_see_doc
0	27165	8019	mullerian aplasia and hyperandrogenism	NaN	NaN	Deficiency of the glycoprotein WNT4, associate...	Deficiency of the glycoprotein wnt4, associate...	A rare syndrome with 46,XX disorder of sex dev...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	27165	8019	mullerian aplasia and hyperandrogenism	NaN	NaN	Deficiency of the glycoprotein WNT4, associate...	Deficiency of the glycoprotein wnt4, associate...	A rare syndrome with 46,XX disorder of sex dev...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	node_index	node_id	node_type	node_name	node_source
0	0	9796	gene/protein	PHYHIP	NCBI
1	1	7918	gene/protein	GPANK1	NCBI
2	2	8233	gene/protein	ZRSR2	NCBI
3	3	4899	gene/protein	NRF1	NCBI
4	4	5297	gene/protein	PI4KA	NCBI
...	...	...	...	...	...
129370	129370	R-HSA-936837	pathway	Ion transport by P-type ATPases	REACTOME
129371	129371	R-HSA-997272	pathway	Inhibition of voltage gated Ca2+ channels via...	REACTOME
129372	129372	1062	anatomy	anatomical entity	UBERON
129373	129373	468	anatomy	multi-cellular organism	UBERON
129374	129374	992	anatomy	female gonad	UBERON

	relation	display_relation	x_index	y_index
0	protein_protein	ppi	0	8889
1	protein_protein	ppi	1	2798
2	protein_protein	ppi	2	5646
3	protein_protein	ppi	3	11592
4	protein_protein	ppi	4	2122
...	...	...	...	...
8100493	anatomy_protein_absent	expression absent	66747	5259
8100494	anatomy_protein_absent	expression absent	63824	58254
8100495	anatomy_protein_absent	expression absent	63826	58254
8100496	anatomy_protein_absent	expression absent	64523	58254
8100497	anatomy_protein_absent	expression absent	67302	58254

PrimeKG¶

Table of contents¶

1. Setup¶

a) Import packages¶

b) Create data directories¶

2. Data download¶

All files provided by the project¶

Files needed to create the knowledge graph¶

3. Data import¶

4. Data inspection¶

a) Number of nodes and edges¶

b) Types of nodes and edges¶

c) Example entries¶

Nodes, edges and some annotations¶

Further node annotations¶

Nodes, edges and annotations in a different format¶

5. Schema discovery¶

6. Knowledge graph reconstruction¶

a) Convert the data into a standardized format¶

Prepare node annotations¶

Nodes¶

Edges¶

b) Export the standardized data to two CSV files¶

c) Use the standardized data to build a graph¶

d) Export the graph to a GraphML file¶

7. Subgraph exploration¶

a) Search for interesting nodes¶

b) Explore the neighborhood of a chosen node¶

c) Find shortest paths between two chosen nodes¶