import os

import gravis as gv  # for visualization of the KG schema and subgraphs, developed by the author of this notebook
import igraph as ig
import pandas as pd

import shared_bmkg

# All files of this notebook live below a folder named after the project:
# fetched inputs go to "downloads", generated outputs go to "results".
project_name = "hald"
download_dir, results_dir = (
    os.path.join(project_name, subfolder) for subfolder in ("downloads", "results")
)

for directory in (download_dir, results_dir):
    shared_bmkg.create_dir(directory)

# Files to fetch, one tuple per file: (local filename, download URL, expected MD5 checksum).
download_specification = [
    # Node and edge annotations, loaded later as `data_nodes` and `data_edges`
    ("Entity_Info.json", "https://figshare.com/ndownloader/files/43612509", "1746cde24a1bac0460f1ccf646608cc9"),
    ("Relation_Info.json", "https://figshare.com/ndownloader/files/43612506", "0c1fa199269adc58f64ad4d5b9fd87b9"),

    # The same content in a tabular format, loaded later as `df_neo4j_entities` and `df_neo4j_roles`
    ("Entities.csv", "https://figshare.com/ndownloader/files/43612494", "b29f16555759edbbd05e59fa34cccdc5"),
    ("Roles.csv", "https://figshare.com/ndownloader/files/43612500", "65ad0206fb61bbc483065e47aa113172"),

    # Information about the articles, loaded later as `data_literature_info`
    ("Literature_Info.json", "https://figshare.com/ndownloader/files/43612512", "10b78e8ec30f5b85f2a58d8fe24f056b"),

    # Biomarker annotations, loaded later as `data_aging_biomarkers` and `data_longevity_biomarkers`
    ("Aging_Biomarkers.json", "https://figshare.com/ndownloader/files/43612503", "abd0eb6cb7295ae500c5d676b7797324"),
    ("Longevity_Biomarkers.json", "https://figshare.com/ndownloader/files/43612497", "0dbd9c3f8474dc3cd744ed38af460d75"),
]

# Fetch each file into the download directory and check it against its MD5 checksum.
for filename, url, md5 in download_specification:
    filepath = os.path.join(download_dir, filename)
    shared_bmkg.fetch_file(url, filepath)
    shared_bmkg.validate_file(filepath, md5)
    print()  # blank line between the reports of consecutive files

Found a full local copy of "hald/downloads/Entity_Info.json".
MD5 checksum is correct.

Found a full local copy of "hald/downloads/Relation_Info.json".
MD5 checksum is correct.

Found a full local copy of "hald/downloads/Entities.csv".
MD5 checksum is correct.

Found a full local copy of "hald/downloads/Roles.csv".
MD5 checksum is correct.

Found a full local copy of "hald/downloads/Literature_Info.json".
MD5 checksum is correct.

Found a full local copy of "hald/downloads/Aging_Biomarkers.json".
MD5 checksum is correct.

Found a full local copy of "hald/downloads/Longevity_Biomarkers.json".
MD5 checksum is correct.

%%time

# Read the node file first and the edge file second, both from the download directory.
data_nodes, data_edges = (
    shared_bmkg.read_json_file(os.path.join(download_dir, json_name))
    for json_name in ("Entity_Info.json", "Relation_Info.json")
)

CPU times: user 11.6 s, sys: 3.35 s, total: 14.9 s
Wall time: 15.9 s

%%time

# Read the entity table first and the role table second, both from the download directory.
df_neo4j_entities, df_neo4j_roles = (
    shared_bmkg.read_csv_file(os.path.join(download_dir, csv_name))
    for csv_name in ("Entities.csv", "Roles.csv")
)

CPU times: user 311 ms, sys: 104 ms, total: 415 ms
Wall time: 322 ms

%%time

data_literature_info = shared_bmkg.read_json_file(os.path.join(download_dir, "Literature_Info.json"))

CPU times: user 21.3 s, sys: 7.13 s, total: 28.5 s
Wall time: 30.6 s

%%time

# Read the aging biomarker file first and the longevity biomarker file second.
data_aging_biomarkers, data_longevity_biomarkers = (
    shared_bmkg.read_json_file(os.path.join(download_dir, json_name))
    for json_name in ("Aging_Biomarkers.json", "Longevity_Biomarkers.json")
)

CPU times: user 58.3 ms, sys: 20.7 ms, total: 79 ms
Wall time: 81.6 ms

# The number of top-level entries in each structure is taken as the node / edge count.
num_nodes, num_edges = len(data_nodes), len(data_edges)

print(f"{num_nodes:,} nodes")
print(f"{num_edges:,} edges")

12,257 nodes
116,495 edges

# Tally how many nodes carry each node type.
nt_key = "type"
nt_counts = {}
for records in data_nodes.values():
    # Every node maps to a list; the type is read from its first element.
    node_type = records[0][nt_key]
    nt_counts[node_type] = nt_counts.get(node_type, 0) + 1

num_node_types = len(nt_counts)
print(f"{num_node_types} node types, sorted by their frequency of occurrence:")
# Stable descending sort: types with equal counts keep their order of first appearance.
ranked_node_types = sorted(nt_counts.items(), key=lambda pair: pair[1], reverse=True)
for nt, cnt in ranked_node_types:
    print(f"- {nt}: {cnt}")

10 node types, sorted by their frequency of occurrence:
- Gene: 5624
- Disease: 3501
- Mutation: 2217
- RNA: 388
- Carbohydrate: 211
- Lipid: 177
- Peptide: 82
- Protein: 29
- Pharmaceutical Preparations: 15
- Toxin: 13

# Tally how many edges carry each edge type (the value of the "relationship" key).
et_key = "relationship"
et_counts = {}
for edge_record in data_edges.values():
    et = edge_record[et_key]
    et_counts[et] = et_counts.get(et, 0) + 1

num_edge_types = len(et_counts)
print(f"{num_edge_types} edge types, sorted by their frequency of occurrence:")

# Print only the n most frequent and the n least frequent edge types, because the
# full list is very long. Earlier versions of this cell printed n+1 entries at the
# top and n-1 at the bottom, and lost the tail entirely for short lists.
n = 10
ranked_edge_types = sorted(et_counts.items(), key=lambda item: -item[1])
if len(ranked_edge_types) <= 2 * n:
    # Short list: head and tail would touch or overlap, so show everything.
    head, tail = ranked_edge_types, []
else:
    head = ranked_edge_types[:n]
    tail = ranked_edge_types[len(ranked_edge_types) - n:]

for et, cnt in head:
    print(f"- {et}: {cnt}")
if tail:
    print('...')
    for et, cnt in tail:
        print(f"- {et}: {cnt}")

3058 edge types, sorted by their frequency of occurrence:
- associated: 19110
- include: 5542
- increase: 2088
- result: 2015
- cause: 2006
- lead: 1990
- occur: 1600
- characterized: 1468
- develop: 1467
- related: 1425
- show: 1250
...
- exhibit performance in: 1
- modify risk of: 1
- be uncommon but devastate cause of: 1
- visualize changes severity in: 1
- contributor to: 1
- Laboratory: 1
- remodeled: 1
- elucidated: 1
- saturated: 1

# Correctness checks

# Totals over all per-type tallies, compared below against the overall counts.
sum_node_types = sum(nt_counts.values())
sum_edge_types = sum(et_counts.values())

# 1) Do the counts of different node types add up to the total number of nodes?
assert sum_node_types == num_nodes, f"Node counts differ: {sum_node_types} != {num_nodes}"
print(f"{sum_node_types:,} = {num_nodes:,} nodes")

# 2) Do the counts of different edge types add up to the total number of edges?
assert sum_edge_types == num_edges, f"Edge counts differ: {sum_edge_types} != {num_edges}"
print(f"{sum_edge_types:,} = {num_edges:,} edges")

12,257 = 12,257 nodes
116,495 = 116,495 edges

def report_first_n_items(data, n):
    """Print the first ``n`` (key, value) pairs of a dict in insertion order.

    Each pair is printed as its string representation cut off after 1000
    characters, followed by ' ...' and an empty line.

    Args:
        data: Mapping whose items are reported.
        n: Maximum number of items to print. Values of zero or below print
            nothing (previously they printed every item).
    """
    for i, item in enumerate(data.items(), 1):
        # Test before printing so that n <= 0 yields no output at all.
        if i > n:
            break
        print(str(item)[:1000], '...')
        print()

def report_last_n_items(data, n):
    """Print the last ``n`` (key, value) pairs of a dict, newest entry first.

    Each pair is printed as its string representation cut off after 1000
    characters, followed by ' ...' and an empty line.

    Args:
        data: Mapping whose items are reported; its items view must support
            ``reversed()``, as the items view of a built-in dict does.
        n: Maximum number of items to print. Values of zero or below print
            nothing (previously they printed every item).
    """
    for i, item in enumerate(reversed(data.items()), 1):
        # Test before printing so that n <= 0 yields no output at all.
        if i > n:
            break
        print(str(item)[:1000], '...')
        print()

report_first_n_items(data_nodes, 2)

('MLH1', [{'entity': 'MLH1', 'type': 'Gene', 'PMID': ['12612901', '30275527', '25311944', '22936446', '19949675', '22406557', '23240038', '11325821', '21042749', '25556597', '17556535', '29425284', '22740444', '10954253', '37380216'], 'official full name': 'mutL homolog 1', 'sentence': [['Most such cancers have the CpG island methylator phenotype (CIMP+) with methylation and transcriptional silencing of the mismatch repair gene MLH1.'], ['Our group recently demonstrated that aging human HSCs accumulate microsatellite instability coincident with loss of MLH1, a DNA Mismatch Repair (MMR) protein, which could reasonably predispose to radiation-induced HSC malignancies.', 'In addition, whole-exome sequencing analysis revealed high SNVs and INDELs in lymphomas being driven by loss of Mlh1 and frequently mutated genes had a strong correlation with human leukemias.'], ['ARID1A loss was observed in 9% (22/257) of the cohort: 24% of MMR-deficient tumors (14/59, 13 of the 14 being MLH1/PMS2 defi ...

('CD4', [{'entity': 'CD4', 'type': 'Gene', 'PMID': ['9434661', '9433953', '15210831', '32041953', '8324202', '27243552', '21057376', '34314231', '33587445', '33225623', '31762303', '26284531', '31088755', '27756678', '16113482', '28708810', '25019430', '29529309', '26635008', '35114631', '31530175', '34233446', '23036045', '27940936', '15090829', '18225989', '30979972', '25356944', '24259252', '29165313', '19217939', '33888343', '29762168', '25833895', '33424857', '18925321', '25075743', '16156949', '30748025', '34633448', '1972177', '23981600', '28700495', '30814781', '23068054', '26423550', '35511728', '27097224', '12679605', '32959881', '21298072', '32251142', '31187337', '2631975', '8219229', '30225704', '28737297', '28127989', '28212619', '35249262', '23255844', '19890183', '17318234', '28002550', '34728337', '34143869', '25360575', '23291591', '8819096', '29808701', '33776993', '34106019', '35003076', '30788516', '29535090', '28462821', '23984974', '15050283', '34791781', '344110 ...

report_last_n_items(data_nodes, 2)

('Ototoxicity', [{'entity': 'Ototoxicity', 'type': 'Disease', 'PMID': ['37319406'], 'official full name': None, 'sentence': [['Additionally, the prevalence of aminoglycoside-induced vestibulotoxicity appears to be greater than cochleotoxicity.']], 'numbers of articles': 1, 'JT': ['American journal of audiology'], 'TA': ['Am J Audiol'], 'IF': [1.8], 'IF5': [2.0], 'year': [2023], 'date': [20231101], 'alias names': '', 'description': 'Damage to the EAR or its function secondary to exposure to toxic substances such as drugs used in CHEMOTHERAPY; IMMUNOTHERAPY; or RADIATION.', 'url': 'https://www.ncbi.nlm.nih.gov/mesh/2031054', 'mutation position': '', 'mutation alleles': '', 'MeSH ID': 'D000081015', 'relation': True, 'external links': [], 'aging biomarker': False, 'longevity biomarker': False}]) ...

('Prurigo', [{'entity': 'Prurigo', 'type': 'Disease', 'PMID': ['37903377'], 'official full name': None, 'sentence': [['Late-onset AD with generalized/prurigo lesions was the most predominant phenotype.']], 'numbers of articles': 1, 'JT': ['Folia medica Cracoviensia'], 'TA': ['Folia Med Cracov'], 'IF': [0.0], 'IF5': [0.0], 'year': [2023], 'date': [20230730], 'alias names': '', 'description': 'A name applied to several itchy skin eruptions of unknown cause.', 'url': 'https://www.ncbi.nlm.nih.gov/mesh/68011536', 'mutation position': '', 'mutation alleles': '', 'MeSH ID': 'D011536', 'relation': False, 'external links': [], 'aging biomarker': False, 'longevity biomarker': False}]) ...

report_first_n_items(data_edges, 2)

('Pulmonary Disease, Chronic Obstructive-defined-Inflammation', {'source entity': 'Pulmonary Disease, Chronic Obstructive', 'relationship': 'defined', 'target entity': 'Inflammation', 'sentence': ['(1) Background: Chronic obstructive pulmonary disease (COPD) is defined as an inflammatory disorder that presents an increasingly prevalent health problem.'], 'source': ['COPD'], 'target': ['inflammatory disorder'], 'source type': ['Disease'], 'target type': ['Disease'], 'PMID': ['30781849'], 'DP': ['2019 Feb 13'], 'date': [20190213], 'TI': ['Chronic Obstructive Pulmonary Disease as a Main Factor of Premature Aging.'], 'TA': ['Int J Environ Res Public Health'], 'IF': [0.0], 'IF5': [0.0], 'method': ['deep learning', 'shortest path']}) ...

('Anorexia-associate-Sarcopenia', {'source entity': 'Anorexia', 'relationship': 'associate', 'target entity': 'Sarcopenia', 'sentence': ["(1) Background: Appetite loss in older people, the 'Anorexia of Aging' (AA), is common, associated with under-nutrition, sarcopenia, and frailty and yet receives little attention."], 'source': ['Anorexia'], 'target': ['sarcopenia'], 'source type': ['Disease'], 'target type': ['Disease'], 'PMID': ['30641897'], 'DP': ['2019 Jan 11'], 'date': [20190111], 'TI': ['Assessment and Treatment of the Anorexia of Aging: A Systematic Review.'], 'TA': ['Nutrients'], 'IF': [5.9], 'IF5': [6.6], 'method': ['deep learning']}) ...

report_last_n_items(data_edges, 2)

('Saxitoxin-cause-Drug-Related Side Effects and Adverse Reactions', {'source entity': 'Saxitoxin', 'relationship': 'cause', 'target entity': 'Drug-Related Side Effects and Adverse Reactions', 'sentence': ['Saxitoxin (STX) causes high toxicity by blocking voltage-gated sodium channels, and it poses a major threat to marine ecosystems and human health worldwide.'], 'source': ['Saxitoxin'], 'target': ['toxicity'], 'source type': ['Toxin'], 'target type': ['Disease'], 'PMID': ['37888479'], 'DP': ['2023 Oct 19'], 'date': [20231019], 'TI': ['Physiological Effects of Oxidative Stress Caused by Saxitoxin in the Nematode Caenorhabditis elegans.'], 'TA': ['Mar Drugs'], 'IF': [0.0], 'IF5': [0.0], 'method': ['shortest path']}) ...

('Triglycerides-protect-Dementia', {'source entity': 'Triglycerides', 'relationship': 'protect', 'target entity': 'Dementia', 'sentence': ['Higher triglyceride levels may be reflective of better overall health and/or lifestyle behaviors that would protect against dementia development.'], 'source': ['triglyceride'], 'target': ['dementia'], 'source type': ['Lipid'], 'target type': ['Disease'], 'PMID': ['37879942'], 'DP': ['2023 Nov 27'], 'date': [20231127], 'TI': ['Association Between Triglycerides and Risk of Dementia in Community-Dwelling Older Adults: A Prospective Cohort Study.'], 'TA': ['Neurology'], 'IF': [9.9], 'IF5': [10.3], 'method': ['shortest path']}) ...

df_neo4j_entities

df_neo4j_roles

num_articles = len(data_literature_info)

print(f"There is information about {num_articles:,} articles used as input for "
      f"the NLP pipeline that identified entities and relations in them.")

There is information about 342,651 articles used as input for the NLP pipeline that identified entities and relations in them.

report_first_n_items(data_literature_info, 1)

('35796512', {'PMID': '35796512', 'TI': 'Inflammatory biomarkers, multi-morbidity, and biologic aging.', 'AB': 'OBJECTIVES: To study the association between multi-morbidity percentiles, which is a measure of clinical aging, and interleukin (IL)-6, IL-10, and tumor necrosis factor (TNF)-alpha. METHODS: Participants 50 to 95 years of age from the Mayo Clinic Study of Aging were assigned age- and sex-specific multi-morbidity percentiles using look-up tables that were reported previously (n = 1646). Percentiles were divided into quintiles for analysis. Plasma IL-6, IL-10, and TNF-alpha levels were measured in 1595 participants. Median inflammatory marker levels were compared across multi-morbidity quintiles using nonparametric tests. RESULTS: People with higher multi-morbidity percentiles had significantly higher IL-6 and TNF-alpha levels compared with those with lower multi-morbidity percentiles. Tests for trend across five multi-morbidity quintiles were significant among women for IL-6 a ...

# Number of top-level entries in each of the two biomarker files.
num_aging_biomarkers, num_longevity_biomarkers = map(
    len, (data_aging_biomarkers, data_longevity_biomarkers)
)

# The first line has no placeholders, so it is a plain string rather than an f-string.
print("There is information about nodes that were identified as biomarkers by downstream analyses:")
print(f"- {num_aging_biomarkers:,} entries were found to be aging-related biomarkers")
print(f"- {num_longevity_biomarkers:,} entries were found to be longevity-related biomarkers")

There is information about nodes that were identified as biomarkers by downstream analyses:
- 1,871 entries were found to be aging-related biomarkers
- 531 entries were found to be longevity-related biomarkers

report_first_n_items(data_aging_biomarkers, 1)

('GPT', [{'source entity': 'GPT', 'relationship': 'correlated', 'target entity': 'Death', 'sentence': 'RESULTS: Profiling of blood parameters demonstrated that elevated levels of alanine aminotransferase (ALT), total bilirubin (T-bil), blood urea nitrogen (BUN), creatinine (Cr) and a decreased platelet count were significantly correlated with death within 1 week in a training cohort.', 'source': 'alanine aminotransferase', 'target': 'death', 'source type': 'Gene', 'target type': 'Disease', 'PMID': '28011502', 'DP': '2017 Jan', 'date': 20170101, 'TI': 'Objective Predictive Score as a Feasible Biomarker for Short-term Survival in TerminalIy Ill Patients with Cancer.', 'TA': 'Anticancer Res', 'IF': 2.0, 'IF5': 2.2}]) ...

report_first_n_items(data_longevity_biomarkers, 1)

('Glucose', [{'source entity': 'Glucose', 'relationship': 'attenuate', 'target entity': 'Cerebrovascular Disorders', 'sentence': 'Raising NAD+ levels in model organisms by administration of NAD+ precursors improves glucose and lipid metabolism; attenuates diet-induced weight-gain, diabetes, diabetic kidney disease, and hepatic steatosis; reduces endothelial dysfunction; protects heart from ischemic injury; improves left ventricular function in models of heart failure; attenuates cerebrovascular and neurodegenerative disorders; and increases health-span.', 'source': 'glucose', 'target': 'cerebrovascular and neurodegenerative disorders', 'source type': 'Carbohydrate', 'target type': 'Disease', 'PMID': '37364580', 'DP': '2023 Nov 9', 'date': 20231109, 'TI': 'Nicotinamide Adenine Dinucleotide in Aging Biology: Potential Applications and Many Unknowns.', 'TA': 'Endocr Rev', 'IF': 20.3, 'IF5': 25.8}]) ...

# Colors used for node types in the visualizations. Node types that share a color
# are listed together; types missing from this mapping get no explicit color.
node_type_to_color = {
    **dict.fromkeys(("Pharmaceutical Preparations", "Toxin"), "green"),
    **dict.fromkeys(("Gene", "Peptide", "Protein", "RNA"), "blue"),
    "Disease": "red",
}

# Group the edge types by the (source node type, target node type) pair they connect.
unique_duples_to_edge_types = {}
for entry in data_edges.values():
    # The type fields are lists; only their first element is used here.
    s = entry["source type"][0]
    p = entry["relationship"]
    o = entry["target type"][0]
    duple = (s, o)
    unique_duples_to_edge_types.setdefault(duple, set()).add(p)

# Build the schema graph: one vertex per node type and one arrow per
# (source node type, target node type) pair that occurs in the edge data.
gs = ig.Graph(directed=True)
unique_nodes = set()
for (s, o), ps in unique_duples_to_edge_types.items():
    # Add each node type as a vertex the first time it is seen.
    for node in (s, o):
        if node in unique_nodes:
            continue
        unique_nodes.add(node)

        node_size = int(nt_counts[node])
        node_color = node_type_to_color.get(node, '')
        node_hover = f"{node}\n\n{nt_counts[node]} nodes of this type are contained in the knowledge graph."
        gs.add_vertex(node, size=node_size, color=node_color, label_color=node_color, hover=node_hover)

    edge_size = len(ps)  # number of edge types represented by a single arrow
    edge_color = node_type_to_color.get(s, '')
    # `ps` is a set, whose iteration order is not stable between interpreter runs.
    # Sorting makes the hover text (and thus the exported HTML) reproducible.
    edge_type_list = ', '.join(f'"{edge_type}"' for edge_type in sorted(ps, key=str))
    edge_hover = (
        f"{s} -> {o}\n\nThere are {len(ps)} different edge types between these two node types, "
        f"represented here with just a single arrow to keep the depiction tidy.\n\nList of edge types:\n{edge_type_list}")
    gs.add_edge(s, o, size=edge_size, color=edge_color, label_color=edge_color, hover=edge_hover)

gs.vcount(), gs.ecount()

(10, 63)

# Keyword arguments for the interactive figure, grouped by what they configure.
d3_options = dict(
    # node labels
    show_node_label=True,
    node_label_data_source="name",
    # edge labels and shape
    show_edge_label=False,
    edge_curvature=0.1,
    # node size and interaction
    use_node_size_normalization=True,
    node_size_normalization_min=10,
    node_size_normalization_max=50,
    node_drag_fix=True,
    node_hover_neighborhood=True,
    # edge size
    use_edge_size_normalization=True,
    edge_size_normalization_max=3,
    # layout
    many_body_force_strength=-3000,
    zoom_factor=1.0,
)
fig = gv.d3(gs, **d3_options)
fig

# Export the schema visualization to a standalone HTML file
schema_filepath = os.path.join(results_dir, f"{project_name}_schema.html")
fig.export_html(schema_filepath, overwrite=True)

%%time

# Default node format: (id, type, properties). The two keys that become id and
# type are left out of the properties dict.
node_core_keys = ("entity", "type")
nodes = [
    (
        record["entity"],
        record["type"],
        {k: v for k, v in record.items() if k not in node_core_keys},
    )
    # Every node maps to a list; only its first element is converted.
    for record in (records[0] for records in data_nodes.values())
]

CPU times: user 216 ms, sys: 24.3 ms, total: 240 ms
Wall time: 281 ms

%%time

# Default edge format: (source id, target id, type, properties). The three keys
# that become ids and type are left out of the properties dict.
edge_core_keys = ("source entity", "target entity", "relationship")
edges = [
    (
        record["source entity"],
        record["target entity"],
        record["relationship"],
        {k: v for k, v in record.items() if k not in edge_core_keys},
    )
    for record in data_edges.values()
]

CPU times: user 4.32 s, sys: 179 ms, total: 4.5 s
Wall time: 4.55 s

nodes_csv_filepath = shared_bmkg.export_nodes_as_csv(nodes, results_dir, project_name)

edges_csv_filepath = shared_bmkg.export_edges_as_csv(edges, results_dir, project_name)

%%time

g = shared_bmkg.create_graph(nodes, edges)

CPU times: user 2.14 s, sys: 28.7 ms, total: 2.16 s
Wall time: 2.17 s

shared_bmkg.report_graph_stats(g)

Directed multigraph with 12257 nodes, 116495 edges and a density of 0.0007754.

# Correctness checks

# Sizes of the reconstructed graph, compared below against the raw data counts.
num_nodes_in_graph, num_edges_in_graph = g.vcount(), g.ecount()

# 1) Does the reconstructed graph contain the same number of nodes as the raw data?
assert num_nodes_in_graph == num_nodes, f"Node counts differ: {num_nodes_in_graph} != {num_nodes}"
print(f"{num_nodes_in_graph:,} = {num_nodes:,}")

# 2) Does the reconstructed graph contain the same number of edges as the raw data?
assert num_edges_in_graph == num_edges, f"Edge counts differ: {num_edges_in_graph} != {num_edges}"
print(f"{num_edges_in_graph:,} = {num_edges:,}")

12,257 = 12,257
116,495 = 116,495

%%time

g_graphml_filepath = shared_bmkg.export_graph_as_graphml(g, results_dir, project_name)

CPU times: user 1.2 s, sys: 116 ms, total: 1.32 s
Wall time: 1.3 s

# Drug: Imatinib - seems not to be contained in HALD
shared_bmkg.list_nodes_matching_substring(g, "imatinib")

id   type 
==========

# Gene: ABL1
shared_bmkg.list_nodes_matching_substring(g, "abl1")

id      type    
================
ABL1    Gene

# Disease: Leukemia - to find Chronic Myeloid Leukemia (CML)
shared_bmkg.list_nodes_matching_substring(g, "leukemia")

id                                                        type       
=====================================================================
Leukemia                                                  Disease    
Leukemia L1210                                            Disease    
Leukemia, B-Cell                                          Disease    
Leukemia, Biphenotypic, Acute                             Disease    
Leukemia, Erythroblastic, Acute                           Disease    
Leukemia, Hairy Cell                                      Disease    
Leukemia, Large Granular Lymphocytic                      Disease    
Leukemia, Lymphocytic, Chronic, B-Cell                    Disease    
Leukemia, Lymphoid                                        Disease    
Leukemia, Mast-Cell                                       Disease    
Leukemia, Megakaryoblastic, Acute                         Disease    
Leukemia, Monocytic, Acute                                Disease    
Leukemia, Myelogenous, Chronic, BCR-ABL Positive          Disease    
Leukemia, Myeloid                                         Disease    
Leukemia, Myeloid, Accelerated Phase                      Disease    
Leukemia, Myeloid, Acute                                  Disease    
Leukemia, Myeloid, Chronic, Atypical, BCR-ABL Negative    Disease    
Leukemia, Myelomonocytic, Chronic                         Disease    
Leukemia, Myelomonocytic, Juvenile                        Disease    
Leukemia, Prolymphocytic                                  Disease    
Leukemia, Prolymphocytic, T-Cell                          Disease    
Leukemia, Promyelocytic, Acute                            Disease    
Leukemia, T-Cell                                          Disease    
Leukemia-Lymphoma, Adult T-Cell                           Disease    
Precursor Cell Lymphoblastic Leukemia-Lymphoma            Disease    
Precursor T-Cell Lymphoblastic Leukemia-Lymphoma          Disease    
Preleukemia                                               Disease

# Neighborhood of gene ABL1
source = "ABL1"
# Subgraph centered on the chosen node (presumably the node together with its
# direct neighbors - confirm against shared_bmkg.get_egocentric_subgraph).
subgraph = shared_bmkg.get_egocentric_subgraph(g, source)

# Export
# One base name is shared by the GraphML file and the two CSV files.
filename = f"{project_name}_neighbors_abl1"
shared_bmkg.export_graph_as_graphml(subgraph, results_dir, filename)
# Passing the subgraph as last argument presumably restricts the CSV export to the
# nodes / edges contained in it - TODO confirm in shared_bmkg.
shared_bmkg.export_nodes_as_csv(nodes, results_dir, filename, subgraph)
shared_bmkg.export_edges_as_csv(edges, results_dir, filename, subgraph)

# Report
shared_bmkg.report_graph_stats(subgraph)
# The node type colors and the chosen node are handed over to the visualization.
shared_bmkg.visualize_graph(subgraph, node_type_to_color, source)

Directed multigraph with 3 nodes, 2 edges and a density of 0.2222.

# Neighborhood of disease CML
source = "Leukemia, Myelogenous, Chronic, BCR-ABL Positive"
# Subgraph centered on the chosen node (presumably the node together with its
# direct neighbors - confirm against shared_bmkg.get_egocentric_subgraph).
subgraph = shared_bmkg.get_egocentric_subgraph(g, source)

# Export
# The export and the statistics below use the full subgraph with all multi-edges;
# only the visualization at the end uses the simplified version.
filename = f"{project_name}_neighbors_cml"
shared_bmkg.export_graph_as_graphml(subgraph, results_dir, filename)
shared_bmkg.export_nodes_as_csv(nodes, results_dir, filename, subgraph)
shared_bmkg.export_edges_as_csv(edges, results_dir, filename, subgraph)

# Report
shared_bmkg.report_graph_stats(subgraph)
# NOTE(review): if `subgraph` is an igraph Graph, simplify() with default arguments
# also removes self-loops and modifies the graph in place - confirm this is intended.
subgraph = subgraph.simplify()  # Reduced subgraph without multi-edges in order to enable better visualization
shared_bmkg.visualize_graph(subgraph, node_type_to_color, source)

Directed multigraph with 29 nodes, 1535 edges and a density of 1.825.

# Neighborhood of disease CMML - to show a small example with multi-edges
source = "Leukemia, Myelomonocytic, Chronic"
subgraph = shared_bmkg.get_egocentric_subgraph(g, source)

# Report
shared_bmkg.report_graph_stats(subgraph)
shared_bmkg.visualize_graph(subgraph, node_type_to_color, source)

Directed multigraph with 9 nodes, 126 edges and a density of 1.556.

# Paths from gene TET2 to disease CML
source = "TET2"
target = "Leukemia, Myelogenous, Chronic, BCR-ABL Positive"
# Subgraph made of paths between the two chosen nodes (presumably the shortest
# paths from source to target - confirm against shared_bmkg.get_paths_subgraph).
subgraph = shared_bmkg.get_paths_subgraph(g, source, target)

# Report
shared_bmkg.report_graph_stats(subgraph)
# Both end points are handed over to the visualization in addition to the colors.
shared_bmkg.visualize_graph(subgraph, node_type_to_color, source, target)

Directed multigraph with 4 nodes, 4 edges and a density of 0.25.

# Paths from Von Willebrand factor (VWF) to Alzheimer disease - to show an example with more paths
source = "VWF"
target = "Alzheimer Disease"
subgraph = shared_bmkg.get_paths_subgraph(g, source, target)

# Report
shared_bmkg.report_graph_stats(subgraph)
shared_bmkg.visualize_graph(subgraph, node_type_to_color, source, target)

Directed multigraph with 13 nodes, 22 edges and a density of 0.1302.

sqlite_db_filepath = os.path.join(results_dir, f"{project_name}_graph.sqlite")

shared_bmkg.delete_file(sqlite_db_filepath)

# Table for the nodes: one row per node, identified by its id.
# The "properties" column holds text that the query cells further below
# read as JSON (via json_extract and the -> operator).
sql_cmd = """
CREATE TABLE nodes (
  id TEXT PRIMARY KEY,
  type TEXT,
  properties TEXT
);
"""

# The statement is passed to the sqlite3 command line program as an argument
# after the database file path.
shared_bmkg.run_shell_command(['sqlite3', sqlite_db_filepath, sql_cmd])

# Table for the edges: one row per edge, with source and target referring to
# rows of the "nodes" table. No primary key is declared for this table.
# NOTE(review): SQLite only enforces FOREIGN KEY constraints when
# "PRAGMA foreign_keys = ON" is active for the connection - confirm whether
# enforcement is wanted here or the constraints are meant as documentation only.
sql_cmd = """
CREATE TABLE edges (
  source_id TEXT,
  target_id TEXT,
  type TEXT,
  properties TEXT,
  FOREIGN KEY (source_id) REFERENCES nodes(id),
  FOREIGN KEY (target_id) REFERENCES nodes(id)
);
"""

# The statement is passed to the sqlite3 command line program as an argument
# after the database file path.
shared_bmkg.run_shell_command(['sqlite3', sqlite_db_filepath, sql_cmd])

# Load the exported node CSV file into the "nodes" table with the ".import"
# dot-command of the sqlite3 command line program ("--skip 1" presumably skips
# the header row of the CSV file - confirm against the sqlite3 CLI documentation).
# NOTE(review): the file path is inserted unquoted, so a path containing spaces
# would presumably break the dot-command - verify if the directories are changed.
sqlite_cmd = f".import --csv --skip 1 {nodes_csv_filepath} nodes"

shared_bmkg.run_shell_command(['sqlite3', sqlite_db_filepath, '-cmd', sqlite_cmd])

# Same procedure for the exported edge CSV file and the "edges" table.
sqlite_cmd = f".import --csv --skip 1 {edges_csv_filepath} edges"

shared_bmkg.run_shell_command(['sqlite3', sqlite_db_filepath, '-cmd', sqlite_cmd])

import sqlite3

# The connection and cursor opened here are reused by the query cells that follow.
conn = sqlite3.connect(sqlite_db_filepath)
cursor = conn.cursor()

# Find all nodes whose id contains the substring; the search term is bound as a
# parameter instead of being pasted into the SQL text.
substring = "leukemia"
search_term = f"%{substring}%"

query = """
SELECT id, type
FROM nodes
WHERE LOWER(id) LIKE LOWER(?)
ORDER BY id;
"""
result = cursor.execute(query, (search_term,)).fetchall()
for row in result:
    print(row)

('Leukemia', 'Disease')
('Leukemia L1210', 'Disease')
('Leukemia, B-Cell', 'Disease')
('Leukemia, Biphenotypic, Acute', 'Disease')
('Leukemia, Erythroblastic, Acute', 'Disease')
('Leukemia, Hairy Cell', 'Disease')
('Leukemia, Large Granular Lymphocytic', 'Disease')
('Leukemia, Lymphocytic, Chronic, B-Cell', 'Disease')
('Leukemia, Lymphoid', 'Disease')
('Leukemia, Mast-Cell', 'Disease')
('Leukemia, Megakaryoblastic, Acute', 'Disease')
('Leukemia, Monocytic, Acute', 'Disease')
('Leukemia, Myelogenous, Chronic, BCR-ABL Positive', 'Disease')
('Leukemia, Myeloid', 'Disease')
('Leukemia, Myeloid, Accelerated Phase', 'Disease')
('Leukemia, Myeloid, Acute', 'Disease')
('Leukemia, Myeloid, Chronic, Atypical, BCR-ABL Negative', 'Disease')
('Leukemia, Myelomonocytic, Chronic', 'Disease')
('Leukemia, Myelomonocytic, Juvenile', 'Disease')
('Leukemia, Prolymphocytic', 'Disease')
('Leukemia, Prolymphocytic, T-Cell', 'Disease')
('Leukemia, Promyelocytic, Acute', 'Disease')
('Leukemia, T-Cell', 'Disease')
('Leukemia-Lymphoma, Adult T-Cell', 'Disease')
('Precursor Cell Lymphoblastic Leukemia-Lymphoma', 'Disease')
('Precursor T-Cell Lymphoblastic Leukemia-Lymphoma', 'Disease')
('Preleukemia', 'Disease')

%%time

key = "official full name"
substring = "myeloid"

# json_extract() reads the value stored under `key` from the JSON text in the
# "properties" column. The key is quoted inside the JSON path because it contains
# spaces. Only the search term is bound as a parameter; `key` is a constant of
# this notebook that is formatted into the SQL text.
query = f"""
SELECT id, type, json_extract(properties, '$."{key}"') AS official_full_name
FROM nodes
WHERE LOWER(official_full_name) LIKE LOWER(?)
ORDER BY id;
"""
search_term = f'%{substring}%'
cursor.execute(query, (search_term,))
result = cursor.fetchall()

# The message uses `key` instead of a hard-coded copy of its value, so it stays
# correct when another key is queried.
print(f'Nodes with the substring "{substring}" in the value of the key "{key}" in the JSON data')
for row in result:
    print(row)

Nodes with the substring "myeloid" in the value of the key "official full name" in the JSON data
('MLF1', 'Gene', 'myeloid leukemia factor 1')
('MZF1', 'Gene', 'myeloid zinc finger 1')
('TREM1', 'Gene', 'triggering receptor expressed on myeloid cells 1')
('TREM2', 'Gene', 'triggering receptor expressed on myeloid cells 2')
CPU times: user 1.86 s, sys: 383 ms, total: 2.24 s
Wall time: 2.24 s

%%time

key = "official full name"
substring = "myeloid"

# Same query as with json_extract(), written with the -> operator instead.
# As the printed rows show, -> returns the value in its JSON form, i.e. strings
# come back wrapped in double quotes, whereas json_extract() returns plain text.
# Only the search term is bound as a parameter; `key` is a constant of this
# notebook that is formatted into the SQL text.
query = f"""
SELECT id, type, properties -> '$."{key}"' AS official_full_name
FROM nodes
WHERE LOWER(official_full_name) LIKE LOWER(?)
ORDER BY id;
"""
search_term = f'%{substring}%'
cursor.execute(query, (search_term,))
result = cursor.fetchall()

# The message uses `key` instead of a hard-coded copy of its value, so it stays
# correct when another key is queried.
print(f'Nodes with the substring "{substring}" in the value of the key "{key}" in the JSON data')
for row in result:
    print(row)

Nodes with the substring "myeloid" in the value of the key "official full name" in the JSON data
('MLF1', 'Gene', '"myeloid leukemia factor 1"')
('MZF1', 'Gene', '"myeloid zinc finger 1"')
('TREM1', 'Gene', '"triggering receptor expressed on myeloid cells 1"')
('TREM2', 'Gene', '"triggering receptor expressed on myeloid cells 2"')
CPU times: user 1.94 s, sys: 303 ms, total: 2.24 s
Wall time: 2.24 s

HALD¶

Table of contents¶

1. Setup¶

a) Import packages¶

b) Create data directories¶

2. Data download¶

All files provided by the project¶

Files needed to create the knowledge graph¶

3. Data import¶

4. Data inspection¶

a) Number of nodes and edges¶

b) Types of nodes and edges¶

c) Example entries¶

Nodes together with node annotations¶

Edges together with edge annotations¶

Nodes, edges and annotations in a different format for Neo4j¶

Background information about the used literature¶

5. Schema discovery¶

6. Knowledge graph reconstruction¶

a) Convert the data into a standardized format¶

Nodes¶

Edges¶

b) Export the standardized data to two CSV files¶

c) Use the standardized data to build a graph¶

d) Export the graph to a GraphML file¶

7. Subgraph exploration¶

a) Search for interesting nodes¶

b) Explore the neighborhood of a chosen node¶

c) Find shortest paths between two chosen nodes¶

Appendix: Loading and querying the converted data with SQLite¶

a) Create an SQLite database with a suitable schema¶

b) Load the CSV data¶

c) Query the data¶

Standard SQL query¶

Non-standard SQL query using JSON support of SQLite¶

	entity:ID	name	type	frequency	:LABEL
0	1	Pulmonary Disease, Chronic Obstructive	Disease	1034	Disease
1	2	Inflammation	Disease	4175	Disease
2	3	Anorexia	Disease	147	Disease
3	4	Sarcopenia	Disease	2072	Disease
4	5	GPT	Gene	48	Gene
...	...	...	...	...	...
6917	6918	SUPT5H	Gene	1	Gene
6918	6919	HOXA3	Gene	2	Gene
6919	6920	G6PC1	Gene	2	Gene
6920	6921	OSR1	Gene	2	Gene
6921	6922	Saxitoxin	Toxin	1	Toxin

	:START_ID	:END_ID	relation	weight	method	:TYPE
0	1	2	defined	1	deep learning; shortest path	defined
1	3	4	associate	1	deep learning	associate
2	4	3	associate	1	deep learning	associate
3	5	6	recognized	1	deep learning	recognized
4	5	6	increase	1	shortest path	increase
...	...	...	...	...	...	...
116482	578	2	alleviate	1	shortest path	alleviate
116483	104	1359	associated	1	deep learning; shortest path	associated
116484	1359	104	associated	1	deep learning; shortest path	associated
116485	104	8	protect	1	shortest path	protect
116486	6922	65	cause	1	shortest path	cause

HALD¶

Table of contents¶

1. Setup¶

a) Import packages¶

b) Create data directories¶

2. Data download¶

All files provided by the project¶

Files needed to create the knowledge graph¶

3. Data import¶

4. Data inspection¶

a) Number of nodes and edges¶

b) Types of nodes and edges¶

c) Example entries¶

Nodes together with node annotations¶

Edges together with edge annotations¶

Nodes, edges and annotations in a different format for Neo4j¶

Background information about the used literature¶

Information about a downstream analysis of nodes related to aging or longevity¶

5. Schema discovery¶

6. Knowledge graph reconstruction¶

a) Convert the data into a standardized format¶

Nodes¶

Edges¶

b) Export the standardized data to two CSV files¶

c) Use the standardized data to build a graph¶

d) Export the graph to a GraphML file¶

7. Subgraph exploration¶

a) Search for interesting nodes¶

b) Explore the neighborhood of a chosen node¶

c) Find shortest paths between two chosen nodes¶

Appendix: Loading and querying the converted data with SQLite¶

a) Create an SQLite database with a suitable schema¶

b) Load the CSV data¶

c) Query the data¶

Standard SQL query¶

Non-standard SQL query using JSON support of SQLite¶