import math
import os

import gravis as gv  # for visualization of the KG schema and subgraphs, developed by the author of this notebook
import igraph as ig
import pandas as pd

import shared_bmkg

project_name = "oregano"
download_dir = os.path.join(project_name, "downloads")
results_dir = os.path.join(project_name, "results")

shared_bmkg.create_dir(download_dir)
shared_bmkg.create_dir(results_dir)

download_specification = [
    ("OREGANO_V2.1.tsv", "https://figshare.com/ndownloader/files/42612700", "3e534cadf7717c705cbd5e6dd8c392a6"),

    ("ACTIVITY.tsv", "https://figshare.com/ndownloader/files/42612697", "d1faec938bd32ec9bb45ae5bba14d7a2"),
    ("COMPOUND.tsv", "https://figshare.com/ndownloader/files/42612676", "d86277d0e849c3774acb3c6be1878b44"),
    ("DISEASES.tsv", "https://figshare.com/ndownloader/files/42612685", "747a48c9786fd50912bb8d7f33d630df"),
    ("EFFECT.tsv", "https://figshare.com/ndownloader/files/42612694", "9b1656a45df7d0369dc2a8e5f52b5c2b"),
    ("GENES.tsv", "https://figshare.com/ndownloader/files/42612673", "8fe9a917b23ab715a268feda50b4a8d7"),
    ("INDICATION.tsv", "https://figshare.com/ndownloader/files/42612691", "eae2cbaebc64b1387cce1755a386d1cb"),
    ("PATHWAYS.tsv", "https://figshare.com/ndownloader/files/42612688", "d3f0abefe355c4b9cd1eea4cc8d58021"),
    ("PHENOTYPES.tsv", "https://figshare.com/ndownloader/files/42612679", "cada3e9316ba4ec654dd2b9ec2afa23c"),
    ("SIDE_EFFECT.tsv", "https://figshare.com/ndownloader/files/42612670", "8388b5a37db250128e50571d42d96cbc"),
    ("TARGET.tsv", "https://figshare.com/ndownloader/files/42612682", "46e189c556b2b010952d4212b8779cef"),

    ("oreganov2.1_metadata_complete.ttl", "https://figshare.com/ndownloader/files/42700234", "b087300f5e597d263a03c02eaae53bc7"),
]

for filename, url, md5 in download_specification:
    filepath = os.path.join(download_dir, filename)
    shared_bmkg.fetch_file(url, filepath)
    shared_bmkg.validate_file(filepath, md5)
    print()

Found a full local copy of "oregano/downloads/OREGANO_V2.1.tsv".
MD5 checksum is correct.

Found a full local copy of "oregano/downloads/ACTIVITY.tsv".
MD5 checksum is correct.

Found a full local copy of "oregano/downloads/COMPOUND.tsv".
MD5 checksum is correct.

Found a full local copy of "oregano/downloads/DISEASES.tsv".
MD5 checksum is correct.

Found a full local copy of "oregano/downloads/EFFECT.tsv".
MD5 checksum is correct.

Found a full local copy of "oregano/downloads/GENES.tsv".
MD5 checksum is correct.

Found a full local copy of "oregano/downloads/INDICATION.tsv".
MD5 checksum is correct.

Found a full local copy of "oregano/downloads/PATHWAYS.tsv".
MD5 checksum is correct.

Found a full local copy of "oregano/downloads/PHENOTYPES.tsv".
MD5 checksum is correct.

Found a full local copy of "oregano/downloads/SIDE_EFFECT.tsv".
MD5 checksum is correct.

Found a full local copy of "oregano/downloads/TARGET.tsv".
MD5 checksum is correct.

Found a full local copy of "oregano/downloads/oreganov2.1_metadata_complete.ttl".
MD5 checksum is correct.

%%time

df_kg = shared_bmkg.read_tsv_file(os.path.join(download_dir, "OREGANO_V2.1.tsv"), header=None)
df_kg.columns = ["subject", "predicate", "object"]

CPU times: user 2.02 s, sys: 291 ms, total: 2.31 s
Wall time: 2.5 s

%%time

df_activity = shared_bmkg.read_tsv_file(os.path.join(download_dir, "ACTIVITY.tsv"))
df_activity.columns.values[0] = "id"

df_compound = shared_bmkg.read_tsv_file(os.path.join(download_dir, "COMPOUND.tsv"))
df_compound.columns.values[0] = "id"

df_diseases = shared_bmkg.read_tsv_file(os.path.join(download_dir, "DISEASES.tsv"))
df_diseases.columns.values[0] = "id"

df_effect = shared_bmkg.read_tsv_file(os.path.join(download_dir, "EFFECT.tsv"))
df_effect.columns.values[0] = "id"

df_genes = shared_bmkg.read_tsv_file(os.path.join(download_dir, "GENES.tsv"))
df_genes.columns.values[0] = "id"

df_indication = shared_bmkg.read_tsv_file(os.path.join(download_dir, "INDICATION.tsv"))
df_indication.columns.values[0] = "id"

df_pathways = shared_bmkg.read_tsv_file(os.path.join(download_dir, "PATHWAYS.tsv"))
df_pathways.columns.values[0] = "id"

df_phenotypes = shared_bmkg.read_tsv_file(os.path.join(download_dir, "PHENOTYPES.tsv"))
df_phenotypes.columns.values[0] = "id"

df_sideeffect = shared_bmkg.read_tsv_file(os.path.join(download_dir, "SIDE_EFFECT.tsv"))
df_sideeffect.columns.values[0] = "id"

df_target = shared_bmkg.read_tsv_file(os.path.join(download_dir, "TARGET.tsv"))
df_target.columns.values[0] = "id"

CPU times: user 6.52 s, sys: 972 ms, total: 7.49 s
Wall time: 8.12 s

%%time

rdf_kg = shared_bmkg.read_ttl_file(os.path.join(download_dir, "oreganov2.1_metadata_complete.ttl"))

CPU times: user 9min 7s, sys: 11.2 s, total: 9min 19s
Wall time: 9min 26s

unique_nodes = list(set(df_kg['subject'].unique().tolist() + df_kg['object'].unique().tolist()))
num_nodes = len(unique_nodes)
num_edges = len(df_kg)

print(f"{num_nodes:,} nodes")
print(f"{num_edges:,} edges")

88,937 nodes
823,005 edges

raw_triples = [(row.subject, row.predicate, row.object) for row in df_kg.itertuples()]
unique_triples = list(set(raw_triples))

num_raw_triples = len(raw_triples)
num_unique_triples = len(unique_triples)
print(f"{num_raw_triples:,} triples")
print(f"{num_unique_triples:,} unique triples")

823,005 triples
819,884 unique triples

def node_name_to_type(node_name):
    try:
        # Most node names contain both the type and id of a node separated by a ":"
        node_type, _ = node_name.split(':', 1)
        node_type = node_type.lower()
    except Exception:
        # Node names that occur in "has_code" relations as target do not have a ":" and explicit type
        # Here the type "code" is assigned to such nodes to be consistent
        node_type = "code"
    return node_type

nt_counts = dict()
for node_name in unique_nodes:
    nt = node_name_to_type(node_name)
    if nt not in nt_counts:
        nt_counts[nt] = 0    
    nt_counts[nt] += 1

print(len(nt_counts), "node types, sorted by their frequency of occurrence:")
for nt, cnt in sorted(nt_counts.items(), key=lambda item: -item[1]):
    print(f"- {nt}: {cnt}")

12 node types, sorted by their frequency of occurrence:
- compound: 32083
- protein: 14505
- gene: 13363
- disease: 8934
- phenotype: 6854
- side_effect: 5364
- code: 3280
- pathway: 2128
- indication: 2080
- effect: 171
- molecule: 97
- activity: 78

et_column = "predicate"
et_counts = dict(df_kg.groupby(et_column).size().sort_values(ascending=False))

print(len(et_counts), "edge types, sorted by their frequency:")
for key, val in et_counts.items():
    print(f"- {key}: {val}")

18 edge types, sorted by their frequency:
- decrease_efficacy: 215222
- has_target: 202539
- has_side_effect: 112532
- has_phenotype: 88791
- acts_within: 44865
- increase_efficacy: 44114
- has_effect: 19160
- increase_effect: 18999
- causes_condition: 17287
- has_activity: 12584
- gene_product_of: 10881
- increase_activity: 10518
- is_affecting: 8905
- has_indication: 8225
- has_code: 3403
- decrease_activity: 3331
- is_substance_that_treats: 1394
- decrease_effect: 255

# Correctness checks

# 1) Do the counts of different node types add up to the total number of nodes?
sum_node_types = sum(nt_counts.values())
assert sum_node_types == num_nodes, f"Node counts differ: {sum_node_types} != {num_nodes}"
print(f"{sum_node_types:,} = {num_nodes:,} nodes")

# 2) Do the counts of different edge types add up to the total number of edges?
sum_edge_types = sum(et_counts.values())
assert sum_edge_types == num_edges, f"Edge counts differ: {sum_edge_types} != {num_edges}"
print(f"{sum_edge_types:,} = {num_edges:,} edges")

88,937 = 88,937 nodes
823,005 = 823,005 edges

85657 - 88937

-3280

def report_first_n_items(data, n):
    return data.head(n)

def report_last_n_items(data, n):
    return data.tail(n)

report_first_n_items(df_kg, 2)

report_last_n_items(df_kg, 2)

# Caution: Some edges contain a target node without type, i.e. there is no ":" in the node name
df_notype = df_kg[~df_kg['subject'].str.contains(':') | ~df_kg['object'].str.contains(':')]
report_first_n_items(df_notype, 4)

report_first_n_items(df_activity, 2)

report_first_n_items(df_compound, 2)

report_first_n_items(df_diseases, 2)

report_first_n_items(df_effect, 2)

report_first_n_items(df_genes, 2)

report_first_n_items(df_indication, 2)

report_first_n_items(df_pathways, 2)

report_first_n_items(df_phenotypes, 2)

report_first_n_items(df_sideeffect, 2)

report_first_n_items(df_target, 2)

for i, (s, p, o) in enumerate(rdf_kg):
    print("Subject:  ", s)
    print("Predicate:", p)
    print("Object:   ", o)
    print()
    if i >= 4:
        break

Subject:   http://erias.fr/oregano/compound/compound_966
Predicate: http://erias.fr/oregano/#decrease_efficacy
Object:    http://erias.fr/oregano/compound/compound_3132

Subject:   http://erias.fr/oregano/compound/compound_24288
Predicate: http://erias.fr/oregano/#has_target
Object:    http://erias.fr/oregano/protein/protein_2870

Subject:   http://erias.fr/oregano/protein/protein_20017
Predicate: http://erias.fr/oregano/#hpa
Object:    ensg00000127125

Subject:   http://erias.fr/oregano/compound/compound_11152
Predicate: http://erias.fr/oregano/#increase_efficacy
Object:    http://erias.fr/oregano/compound/compound_1530

Subject:   http://erias.fr/oregano/protein/protein_14672
Predicate: http://erias.fr/oregano/#bgee
Object:    ensg00000108813

node_type_to_color = {
    "compound": "green",
    "molecule": "green",

    "gene": "blue",
    "protein": "blue",

    "disease": "red",
    "pathway": "red",
}

%%time

unique_triples = set()
for row in df_kg.itertuples():
    s = node_name_to_type(row.subject)
    p = row.predicate
    o = node_name_to_type(row.object)
    triple = (s, p, o)
    unique_triples.add(triple)

CPU times: user 4.77 s, sys: 30 μs, total: 4.77 s
Wall time: 4.77 s

gs = ig.Graph(directed=True)
unique_nodes = set()
for s, p, o in unique_triples:
    for node in (s, o):
        if node not in unique_nodes:
            unique_nodes.add(node)

            node_size = int(nt_counts[node])
            node_color = node_type_to_color.get(node, '')
            node_hover = f"{node}\n\n{nt_counts[node]} nodes of this type are contained in the knowledge graph."
            gs.add_vertex(node, size=node_size, color=node_color, label_color=node_color, hover=node_hover)

    edge_size = int(et_counts[p])
    edge_color = node_type_to_color.get(s, '')
    edge_hover = f"{p}\n\n{et_counts[p]} edges of this type are contained in the knowledge graph."
    gs.add_edge(s, o, size=edge_size, color=edge_color, hover=edge_hover, label=p, label_color="gray", label_size=5)

gs.vcount(), gs.ecount()

(12, 19)

fig = gv.d3(
    gs,
    show_node_label=True,
    node_label_data_source="name",

    show_edge_label=True,
    edge_label_data_source="label",
    edge_curvature=0.2,

    use_node_size_normalization=True,
    node_size_normalization_min=10,
    node_size_normalization_max=50,
    node_drag_fix=True,
    node_hover_neighborhood=True,
    
    use_edge_size_normalization=True,
    edge_size_normalization_max=3,

    many_body_force_strength=-3000,
    zoom_factor=0.8,
)
fig

# Export the schema visualization
schema_filepath = os.path.join(results_dir, f"{project_name}_schema.html")
fig.export_html(schema_filepath, overwrite=True)

%%time

def strip_if_str(val):
    if isinstance(val, str):
        val = val.strip()
    return val

def is_nan(val):
    if isinstance(val, float):
        return math.isnan(val)
    return False

dataframes = [
    df_activity,
    df_compound,
    df_diseases,
    df_effect,
    df_genes,
    df_indication,
    df_pathways,
    df_phenotypes,
    df_sideeffect,
    df_target,
]
node_name_to_annotation_map = {}
for df in dataframes:
    data_columns = list(df.columns)[1:]
    for row in df.itertuples():
        # indexing: 0=id (ignored), 1=name, 2-n=annotations
        node_name = row[1]
        annotation = {col: row[i] for i, col in enumerate(data_columns, 2)}
        annotation = {strip_if_str(k):strip_if_str(v) for k, v in annotation.items()
                      if v is not None
                      and strip_if_str(v) != ""
                      and not is_nan(v)}
        node_name_to_annotation_map[node_name] = annotation

CPU times: user 18.1 s, sys: 136 ms, total: 18.3 s
Wall time: 18.3 s

for i, (k, v) in enumerate(node_name_to_annotation_map.items()):
    if i >= 20000:
        print(k, v)
    if i >= 20004:
        break

COMPOUND:28063 {'CHEMBL': 'CHEMBL465430', 'PUBCHEM COMPOUND': '44567115', 'NPASS': 'NPC171225'}
COMPOUND:28074 {'CHEMBL': 'CHEMBL510705', 'PUBCHEM COMPOUND': '11216146', 'NPASS': 'NPC171317'}
COMPOUND:28079 {'CHEMBL': 'CHEMBL2426510', 'PUBCHEM COMPOUND': '72708559', 'NPASS': 'NPC171360'}
COMPOUND:28080 {'CHEMBL': 'CHEMBL479677', 'PUBCHEM COMPOUND': '11687274', 'NPASS': 'NPC171372'}
COMPOUND:28081 {'CHEMBL': 'CHEMBL447356', 'PUBCHEM COMPOUND': '25009833', 'NPASS': 'NPC17138'}

%%time

nodes = []
seen_nodes = set()
for row in df_kg.itertuples():
    for node_name in (row.subject, row.object):
        if node_name not in seen_nodes:
            seen_nodes.add(node_name)

            node_id = node_name
            node_type = node_name_to_type(node_name)
            node_properties = node_name_to_annotation_map.get(node_id, {})
            node = (node_id, node_type, node_properties)  # default format
            nodes.append(node)

CPU times: user 3.37 s, sys: 8.11 ms, total: 3.38 s
Wall time: 3.38 s

%%time

edges = []
seen_triples = set()
for row in df_kg.itertuples():
    source_id = row.subject
    target_id = row.object
    edge_type = row.predicate
    edge_properties = {}
    
    triple = (source_id, edge_type, target_id)
    if triple not in seen_triples:
        seen_triples.add(triple)

        edge = (source_id, target_id, edge_type, edge_properties)  # default format
        edges.append(edge)

CPU times: user 4.59 s, sys: 368 ms, total: 4.96 s
Wall time: 4.96 s

nodes_csv_filepath = shared_bmkg.export_nodes_as_csv(nodes, results_dir, project_name)

edges_csv_filepath = shared_bmkg.export_edges_as_csv(edges, results_dir, project_name)

%%time

g = shared_bmkg.create_graph(nodes, edges)

CPU times: user 14.7 s, sys: 764 ms, total: 15.5 s
Wall time: 15.5 s

shared_bmkg.report_graph_stats(g)

Directed multigraph with 88937 nodes, 819884 edges and a density of 0.0001037.

# Correctness checks

# 1) Does the reconstructed graph contain the same number of nodes as the raw data?
num_nodes_in_graph = g.vcount()
assert num_nodes_in_graph == num_nodes, f"Node counts differ: {num_nodes_in_graph} != {num_nodes}"
print(f"{num_nodes_in_graph:,} = {num_nodes:,}")

# 2) Does the reconstructed graph contain the same number of (unique) edges as the raw data?
num_edges_in_graph = g.ecount()
assert num_edges_in_graph == num_unique_triples, f"Edge counts differ: {num_edges_in_graph} != {num_unique_triples}"
print(f"{num_edges_in_graph:,} = {num_unique_triples:,}")

88,937 = 88,937
819,884 = 819,884

%%time

g_graphml_filepath = shared_bmkg.export_graph_as_graphml(g, results_dir, project_name)

CPU times: user 56.9 s, sys: 3.79 s, total: 1min
Wall time: 59.8 s

def print_vertex(v):
    print(v.index)
    for k, v in v.attributes().items():
        if v is not None:
            print(f"- {k}: {v}")

def find_node_by_drugbank_id(graph, drugbank_id):
    for v in graph.vs:
        if v["DRUGBANK"] == drugbank_id:
            print_vertex(v)
            print()

# Drug: Imatinib - with DrugBank accession number "DB00619"
find_node_by_drugbank_id(g, "DB00619")

1059
- name: COMPOUND:605
- type: compound
- DRUGS PRODUCT DATABASE: 12429
- DPD: 2253283.0
- CHEMBL: CHEMBL941
- WIKIPEDIA: Imatinib
- PHARMGKB: PA10804
- THERAPEUTIC TARGETS DATABASE: DNC001383
- FDA DRUG LABEL AT DAILYMED: 211ef2da-2868-4a77-8055-1cb2cd78e24b
- URL: http://en.wikipedia.org/wiki/Imatinib
- KEGG DRUG: D01441
- ATC: L01EA01;L01XE01
- PUBCHEM COMPOUND: 5291.0
- PUBCHEM SUBSTANCE: 46505055;841977
- DRUGBANK: DB00619
- RXCUI: 282388.0
- NDC: 0078-0401-34
- CLINICALTRIALS.GOV: NCT00180830;NCT01437202;NCT01860456
- CHEMSPIDER: 5101
- CHEBI: CHEBI:45783
- BINDINGDB: 13530
- ZINC: ZINC000019632618
- PDB LIGAND: STI
- PDB: STI

def find_node_by_ncbi_gene_id(graph, ncbi_gene_id):
    for v in graph.vs:
        if v["NCBI GENE"] == ncbi_gene_id:
            print_vertex(v)
            print()

# Gene: ABL1 - with NCBI Gene ID "25"
find_node_by_ncbi_gene_id(g, "25")

18276
- name: GENE:27548
- type: gene
- NCBI GENE: 25

62046
- name: GENE:117
- type: gene
- UNIPROTKB: P00519;Q59FK4
- PHARMGKB: PA24413
- HGNC: HGNC76
- GENATLAS: ABL1
- ENSEMBL: ENSG00000097007
- NCBI GENE: 25
- OMIM: 189980
- REFSEQ PROTEIN: NP_005148;NP_009297
- MODBASE: P00519
- REFSEQ RNA: NM_005157;NM_007313
- HUMANCYC GENE: HS01874
- GENECARD: ABL1
- REFSEQ DNA: NG_012034;NT_035014
- UCSC GENOME BROWSER: NM_005157
- COMPARATIVE TOXICOGENOMICS DATABASE: 25.0

def find_node_by_mesh_id(graph, mesh_id):
    for v in graph.vs:
        if v["MESH"] == mesh_id:
            print(v["name"])

# Disease: Chronic Myelogenous Leukemia (CML) - with MeSH ID "D015464"
find_node_by_mesh_id(g, "D015464")

DISEASE:1822

# Disease: Acute Myelogenous Leukemia (AML) - with MeSH ID "D015470"
find_node_by_mesh_id(g, "D015470")

DISEASE:1824

# Neighborhood of drug Imatinib
source = "COMPOUND:605"
subgraph = shared_bmkg.get_egocentric_subgraph(g, source)

# Export
filename = f"{project_name}_neighbors_imatinib"
shared_bmkg.export_graph_as_graphml(subgraph, results_dir, filename)
shared_bmkg.export_nodes_as_csv(nodes, results_dir, filename, subgraph)
shared_bmkg.export_edges_as_csv(edges, results_dir, filename, subgraph)

# Report
shared_bmkg.report_graph_stats(subgraph)
shared_bmkg.visualize_graph(subgraph, node_type_to_color, source)

Directed multigraph with 144 nodes, 728 edges and a density of 0.03511.

# Neighborhood of gene ABL1
source = "GENE:117"
subgraph = shared_bmkg.get_egocentric_subgraph(g, source)

# Export
filename = f"{project_name}_neighbors_abl1"
shared_bmkg.export_graph_as_graphml(subgraph, results_dir, filename)
shared_bmkg.export_nodes_as_csv(nodes, results_dir, filename, subgraph)
shared_bmkg.export_edges_as_csv(edges, results_dir, filename, subgraph)

# Report
shared_bmkg.report_graph_stats(subgraph)
shared_bmkg.visualize_graph(subgraph, node_type_to_color, source)

Directed multigraph with 19 nodes, 20 edges and a density of 0.0554.

# Neighborhood of disease CML
source = "DISEASE:1822"
subgraph = shared_bmkg.get_egocentric_subgraph(g, source)

# Export
filename = f"{project_name}_neighbors_cml"
shared_bmkg.export_graph_as_graphml(subgraph, results_dir, filename)
shared_bmkg.export_nodes_as_csv(nodes, results_dir, filename, subgraph)
shared_bmkg.export_edges_as_csv(edges, results_dir, filename, subgraph)

# Report
shared_bmkg.report_graph_stats(subgraph)
shared_bmkg.visualize_graph(subgraph, node_type_to_color, source)

Directed multigraph with 25 nodes, 56 edges and a density of 0.0896.

# Paths from drug Imatinib to disease AML
source = "COMPOUND:605"
target = "DISEASE:1824"
subgraph = shared_bmkg.get_paths_subgraph(g, source, target)

# Report
shared_bmkg.report_graph_stats(subgraph)
shared_bmkg.visualize_graph(subgraph, node_type_to_color, source, target)

Directed multigraph with 7 nodes, 10 edges and a density of 0.2041.

	subject	predicate	object
823003	PROTEIN:22095	gene_product_of	GENE:31330
823004	PROTEIN:22096	gene_product_of	GENE:25313

	id	NAME_DRUGBANK
0	ACTIVITY:1	the anticoagulant activities
1	ACTIVITY:10	the hepatotoxic activities

	id	DRUGBANK	ATC	DRUGS PRODUCT DATABASE	PUBCHEM SUBSTANCE	KEGG DRUG	PHARMGKB	UNIPROTKB	THERAPEUTIC TARGETS DATABASE	WIKIPEDIA	...	NDC	THERAPEUTIC TARGETS DATABASE	CAS	GENBANK	UNIPROT	CLINICALTRIALS.GOV	DPD	URL	FDA DRUG LABEL AT DAILYMED	IUPHAR LIGAND
0	COMPOUND:1	DB00001	B01AE02	11916	46507011	D06880	PA450195	P01050	DAP000541	Lepirudin	...	50419-150-57	DAP000541	NaN	NaN	P01050	NaN	NaN	NaN	NaN	NaN
1	COMPOUND:2	DB00002	L01FE01	13175	46507042	D03455	PA10040	NaN	DNC000788	Cetuximab	...	66733-948-23	DNC000788	NaN	J00228	NaN	NaN	NaN	NaN	NaN	NaN

	id	PHARMGKB	MESH	SNOMEDCT	UMLS	NDFRT	MEDDRA	SNOMEDCT	NDFRT	MESH	MEDDRA	UMLS	ORPHANET	ICD-11	ICD-10	OMIM	GARD
0	DISEASE:1	PA165108557	D054868	4325000	C0795841	N0000181176	NaN	NaN	NaN	NaN	NaN	C0795841	2308	LD44.B0	Q93.5	NaN	307
1	DISEASE:2	PA446766	D018784	75100008	C0243001	N0000003856	10060921	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	id	NAME_DRUGBANK
0	EFFECT:1	bleeding and hemorrhage
1	EFFECT:10	infection and neutropenia

OREGANO¶

Table of contents¶

1. Setup¶

a) Import packages¶

b) Create data directories¶

2. Data download¶

All files provided by the project¶

Files needed to create the knowledge graph¶

3. Data import¶

4. Data inspection¶

a) Number of nodes and edges¶

b) Types of nodes and edges¶

c) Example entries¶

Edges and the nodes they connect¶

Node annotations¶

Nodes, edges and annotations in a different format for semantic web technologies¶

5. Schema discovery¶

6. Knowledge graph reconstruction¶

a) Convert the data into a standardized format¶

Prepare node annotations¶

Nodes¶

Edges¶

b) Export the standardized data to two CSV files¶

c) Use the standardized data to build a graph¶

d) Export the graph to a GraphML file¶

7. Subgraph exploration¶

a) Search for interesting nodes¶

Drugs¶

Genes¶

Diseases¶

b) Explore the neighborhood of a chosen node¶

c) Find shortest paths between two chosen nodes¶

	subject	predicate	object
0	COMPOUND:1	has_code	B01AE02
1	COMPOUND:2	has_code	L01FE01
2	COMPOUND:3	has_code	R05CB13
3	COMPOUND:4	has_code	L01XX29

	id	PHARMGKB	COMPARATIVE TOXICOGENOMICS DATABASE	ENSEMBL	GENATLAS	GENECARD	HGNC	HUMANCYC GENE	MODBASE	NCBI GENE	...	ENSEMBL	GENATLAS	GENECARD	HGNC	IUPHAR RECEPTOR	URL	PHARMVAR GENE	GENBANK	COMPARATIVE TOXICOGENOMICS DATABASE	NCBI GENE
0	GENE:1	PA24356	1.0	ENSG00000121410	A1BG	A1BG	HGNC5	HS04495	P04217	1	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	GENE:2	PA165392995	NaN	NaN	NaN	A1BG-AS1	HGNC37133	NaN	NaN	503538	...	ENSG00000268895	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	id	SIDER	NAME
0	INDICATION:1	C0015544	Failure to Thrive
1	INDICATION:2	C0020615	Hypoglycemia

	id	HPO	UMLS	MSH	SNOMEDCT_US	MEDDRA	FYLER	NCIT	COHD	EFO	...	EPCC	DOID	MONDO	ICD-O	MP	MPATH	PMID	ORPHA	SNOMED_CT	ICD-9
0	PHENOTYPE:1	HP:0000001	C0444868	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	PHENOTYPE:2	HP:0000002	C4025901	NaN	NaN	NaN	NaN	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	id	SIDER	NAME
0	SIDE_EFFECT:1	C0000729	Abdominal cramps
1	SIDE_EFFECT:2	C0000737	Abdominal pain

	id	HUGO GENE NOMENCLATURE COMMITTEE	GENATLAS	GENBANK GENE DATABASE	GENBANK PROTEIN DATABASE	GUIDE TO PHARMACOLOGY	UNIPROTKB	UNIPROT ACCESSION	DRUGBANK	IUPHAR	...	PCDDB	SFLD	MOONPROT	DEPOD	UNILECTIN	PEROXIBASE	ALLERGOME	IMGT_GENE-DB	REBASE	PATRIC
0	PROTEIN:1	HGNC:3535	F2	M17262	339641.0	2362.0	P00734;B2R7F7;B4E1A7;Q4QZ40;Q53H04;Q53H06;Q69E...	THRB_HUMAN	BE0000048	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	PROTEIN:2	HGNC:3236	EGFR	X00588	757924.0	1797.0	P00533;O00688;O00732;P06268;Q14225;Q68GS5;Q927...	EGFR_HUMAN	BE0000767	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	id	REACTOME
0	PATHWAY:10	R-HSA-418990
1	PATHWAY:100	R-HSA-9614657