import os

import dask.dataframe as dd
import gravis as gv
import igraph as ig

import shared_bmkg

# All inputs and outputs of this notebook live below a project-specific directory.
project_name = "monarch"
download_dir, results_dir = (
    os.path.join(project_name, subdir) for subdir in ("downloads", "results")
)

# Hand both directories to the shared create_dir helper before they are used.
for directory in (download_dir, results_dir):
    shared_bmkg.create_dir(directory)

# Each entry is a (filename, url, md5) tuple: the file name used below
# download_dir, the source URL, and the MD5 checksum the file is validated against.
download_specification = [
    # TAR archive of two files that contain the knowledge graph and annotations for nodes and edges
    # - monarch-kg_nodes.tsv: TSV file that contains the node data
    # - monarch-kg_edges.tsv: TSV file that contains the edge data
    ("monarch-kg.tar.gz", "https://data.monarchinitiative.org/monarch-kg/2024-07-12/monarch-kg.tar.gz", "df24db3c3b743c5829af71b6cd41c9fb"),
]

# Fetch every listed file and validate it against its MD5 checksum.
for filename, url, md5 in download_specification:
    filepath = os.path.join(download_dir, filename)
    shared_bmkg.fetch_file(url, filepath)
    shared_bmkg.validate_file(filepath, md5)
    print()  # blank line after the messages reported for each file

Found a full local copy of "monarch/downloads/monarch-kg.tar.gz".
MD5 checksum is correct.

# Paths of the two TSV files that the archive is expected to contain
filepath_extracted_1 = os.path.join(download_dir, "monarch-kg_nodes.tsv")
filepath_extracted_2 = os.path.join(download_dir, "monarch-kg_edges.tsv")

# Extract the archive only if at least one of the two files is missing
if not all(map(os.path.isfile, (filepath_extracted_1, filepath_extracted_2))):
    filepath = os.path.join(download_dir, "monarch-kg.tar.gz")
    shared_bmkg.extract_tar_gz(filepath)
else:
    print("Found existing files from a previous extraction of the archive.")

Found existing files from a previous extraction of the archive.

def read_tsv_file(filepath):
    """Read a tab-separated file into a Dask dataframe.

    Every column is read with dtype ``str``.

    Args:
        filepath: Path of the TSV file; passed on unchanged to ``dd.read_csv``.

    Returns:
        A Dask dataframe (not a Pandas dataframe).
    """
    # dd.read_csv receives the path and opens the file itself, so no separate
    # file handle is opened here (the previous handle was never read from).
    return dd.read_csv(filepath, sep='\t', dtype=str)

%%time

# Load the node table and the edge table from the extracted TSV files
# as Dask dataframes (see read_tsv_file).
df_nodes = read_tsv_file(os.path.join(download_dir, "monarch-kg_nodes.tsv"))
df_edges = read_tsv_file(os.path.join(download_dir, "monarch-kg_edges.tsv"))

CPU times: user 119 ms, sys: 23.9 ms, total: 143 ms
Wall time: 151 ms

%%time

# Total number of rows in each table. These totals are reused by the
# correctness checks further below.
num_nodes = len(df_nodes)
num_edges = len(df_edges)

# Report the totals with thousands separators
print(f"{num_nodes:,} nodes")
print(f"{num_edges:,} edges")
print()

1,028,155 nodes
11,076,689 edges

CPU times: user 4min 3s, sys: 29.7 s, total: 4min 32s
Wall time: 2min 57s

%%time

# Count the nodes per node type (column "category"), most frequent type first.
nt_column = "category"
nt_counts = df_nodes.groupby(nt_column).size().compute()
nt_counts = nt_counts.sort_values(ascending=False)

print(len(nt_counts), "node types, sorted by their frequency of occurrence:")
# nt_counts is already sorted in descending order, so it is iterated directly
# (same as for the edge types). The loop variable is deliberately not called
# "type", which would shadow the builtin for the rest of the notebook.
for node_type, cnt in nt_counts.items():
    print(f"- {node_type}: {cnt}")
print()

80 node types, sorted by their frequency of occurrence:
- biolink:Gene: 571074
- biolink:Genotype: 133380
- biolink:PhenotypicFeature: 124247
- biolink:BiologicalProcessOrActivity: 38308
- biolink:Disease: 28109
- biolink:GrossAnatomicalStructure: 24210
- biolink:Cell: 22454
- biolink:Pathway: 22343
- biolink:NamedThing: 19576
- biolink:SequenceVariant: 13022
- biolink:AnatomicalEntity: 9978
- biolink:CellularComponent: 5308
- biolink:MolecularEntity: 4618
- biolink:BiologicalProcess: 3656
- biolink:MacromolecularComplex: 2120
- biolink:MolecularActivity: 1446
- biolink:Protein: 1112
- biolink:CellularOrganism: 958
- biolink:Vertebrate: 547
- biolink:Virus: 321
- biolink:BehavioralFeature: 297
- biolink:ChemicalEntity: 267
- biolink:LifeStage: 238
- biolink:PathologicalProcess: 231
- biolink:Drug: 100
- biolink:SmallMolecule: 70
- biolink:OrganismTaxon: 26
- biolink:InformationContentEntity: 23
- biolink:NucleicAcidEntity: 18
- biolink:EvidenceType: 16
- biolink:RNAProduct: 8
- biolink:Transcript: 6
- biolink:Plant: 4
- biolink:Fungus: 4
- biolink:ProcessedMaterial: 3
- biolink:PopulationOfIndividualOrganisms: 2
- biolink:Activity: 2
- biolink:ConfidenceLevel: 2
- biolink:Publication: 2
- biolink:Mammal: 2
- biolink:Agent: 2
- biolink:ProteinFamily: 2
- biolink:Dataset: 2
- biolink:GeneticInheritance: 2
- biolink:EnvironmentalFeature: 2
- biolink:Invertebrate: 2
- biolink:Haplotype: 2
- biolink:Bacterium: 1
- biolink:ChemicalMixture: 1
- biolink:ChemicalExposure: 1
- biolink:CellLine: 1
- biolink:OrganismalEntity: 1
- biolink:Event: 1
- biolink:EnvironmentalProcess: 1
- biolink:DrugExposure: 1
- biolink:Human: 1
- biolink:ProteinDomain: 1
- biolink:Patent: 1
- biolink:Study: 1
- biolink:AccessibleDnaRegion: 1
- biolink:BiologicalSex: 1
- biolink:StudyVariable: 1
- biolink:Zygosity: 1
- biolink:ReagentTargetedGene: 1
- biolink:Exon: 1
- biolink:DiagnosticAid: 1
- biolink:DatasetDistribution: 1
- biolink:Genome: 1
- biolink:MaterialSample: 1
- biolink:MicroRNA: 1
- biolink:IndividualOrganism: 1
- biolink:GenotypicSex: 1
- biolink:Polypeptide: 1
- biolink:PhenotypicSex: 1
- biolink:RegulatoryRegion: 1
- biolink:SiRNA: 1
- biolink:Snv: 1
- biolink:TranscriptionFactorBindingSite: 1
- biolink:Treatment: 1
- biolink:WebPage: 1

CPU times: user 4.69 s, sys: 841 ms, total: 5.53 s
Wall time: 2.56 s

%%time

# Count the edges per edge type (column "predicate"), most frequent type first.
et_column = "predicate"
et_counts = (
    df_edges.groupby(et_column)
    .size()
    .compute()
    .sort_values(ascending=False)
)

print(len(et_counts), "edge types, sorted by their frequency:")
for key, val in et_counts.items():
    print(f"- {key}: {val}")
print()

28 edge types, sorted by their frequency:
- biolink:interacts_with: 2799181
- biolink:expressed_in: 2320065
- biolink:has_phenotype: 1703070
- biolink:enables: 839097
- biolink:actively_involved_in: 787306
- biolink:orthologous_to: 551418
- biolink:located_in: 500184
- biolink:subclass_of: 491204
- biolink:related_to: 282852
- biolink:participates_in: 272586
- biolink:acts_upstream_of_or_within: 181576
- biolink:active_in: 160549
- biolink:part_of: 96113
- biolink:causes: 16839
- biolink:is_sequence_variant_of: 15605
- biolink:model_of: 9902
- biolink:acts_upstream_of: 9366
- biolink:has_mode_of_inheritance: 8577
- biolink:gene_associated_with_condition: 8026
- biolink:contributes_to: 7746
- biolink:treats_or_applied_or_studied_to_treat: 5653
- biolink:associated_with_increased_likelihood_of: 3244
- biolink:colocalizes_with: 2937
- biolink:genetically_associated_with: 2156
- biolink:acts_upstream_of_positive_effect: 549
- biolink:acts_upstream_of_or_within_positive_effect: 512
- biolink:acts_upstream_of_negative_effect: 196
- biolink:acts_upstream_of_or_within_negative_effect: 180

CPU times: user 1min 39s, sys: 13.3 s, total: 1min 52s
Wall time: 28.4 s

# Correctness checks
# Each check raises an AssertionError with both numbers in the message if the
# per-type counts do not add up to the table totals computed earlier.

# 1) Do the counts of different node types add up to the total number of nodes?
sum_node_types = nt_counts.sum()
assert sum_node_types == num_nodes, f"Node counts differ: {sum_node_types} != {num_nodes}"
print(f"{sum_node_types:,} = {num_nodes:,} nodes")

# 2) Do the counts of different edge types add up to the total number of edges?
sum_edge_types = et_counts.sum()
assert sum_edge_types == num_edges, f"Edge counts differ: {sum_edge_types} != {num_edges}"
print(f"{sum_edge_types:,} = {num_edges:,} edges")

1,028,155 = 1,028,155 nodes
11,076,689 = 11,076,689 edges

def report_first_n_items(data, n):
    """Return the first ``n`` items of ``data``, as given by ``data.head(n)``."""
    first_items = data.head(n)
    return first_items

def report_last_n_items(data, n):
    """Return the last ``n`` items of ``data``, as given by ``data.tail(n)``."""
    last_items = data.tail(n)
    return last_items

# Peek at the first and the last two rows of the node table
report_first_n_items(df_nodes, 2)

report_last_n_items(df_nodes, 2)

# Peek at the first and the last two rows of the edge table
report_first_n_items(df_edges, 2)

report_last_n_items(df_edges, 2)

# Display colors for selected node types, grouped by color. Node types that
# are not listed have no entry in the resulting dictionary.
node_type_to_color = {
    node_type: color
    for color, node_types in (
        ("green", ("biolink:Drug", "biolink:ChemicalEntity", "biolink:MolecularEntity", "biolink:SmallMolecule")),
        ("blue", ("biolink:Gene", "biolink:Protein")),
        ("red", ("biolink:Disease", "biolink:Pathway", "biolink:BiologicalProcessOrActivity")),
    )
    for node_type in node_types
}

%%time

# Map every node id to its node type (column "category"), so that the types of
# an edge's subject and object can be looked up by id.
node_id_to_type = {row.id: row.category for row in df_nodes.itertuples()}

CPU times: user 20.4 s, sys: 1.76 s, total: 22.1 s
Wall time: 22.2 s

%%time

# Distinct (subject type, predicate, object type) combinations over all edges.
# Subject and object ids are translated to node types via node_id_to_type.
unique_triples = {
    (node_id_to_type[row.subject], row.predicate, node_id_to_type[row.object])
    for row in df_edges.itertuples()
}

CPU times: user 4min 42s, sys: 15.3 s, total: 4min 57s
Wall time: 4min 57s

# Build the schema graph: one vertex per node type and one edge per unique
# (subject type, predicate, object type) triple.
gs = ig.Graph(directed=True)
unique_nodes = set()
for s, p, o in unique_triples:
    # Add each endpoint type as a vertex the first time it is encountered
    for node in (s, o):
        if node in unique_nodes:
            continue
        unique_nodes.add(node)

        node_color = node_type_to_color.get(node, '')
        gs.add_vertex(
            node,
            size=int(nt_counts[node]),
            color=node_color,
            label_color=node_color,
            hover=f"{node}\n\n{nt_counts[node]} nodes of this type are contained in the knowledge graph.",
        )

    # The edge is sized by the frequency of its predicate and colored like its source type
    gs.add_edge(
        s,
        o,
        size=int(et_counts[p]),
        color=node_type_to_color.get(s, ''),
        hover=f"{p}\n\n{et_counts[p]} edges of this type are contained in the knowledge graph.",
        label=p,
        label_color="gray",
        label_size=5,
    )

gs.vcount(), gs.ecount()

(36, 288)

# Render the schema graph with gravis (d3 variant). Node labels come from the
# vertex attribute "name", edge labels from the edge attribute "label".
# Node and edge sizes are normalized into the given min/max ranges.
fig = gv.d3(
    gs,
    show_node_label=True,
    node_label_data_source="name",

    show_edge_label=True,
    edge_label_data_source="label",
    edge_curvature=0.2,

    use_node_size_normalization=True,
    node_size_normalization_min=10,
    node_size_normalization_max=50,
    node_drag_fix=True,
    node_hover_neighborhood=True,
    
    use_edge_size_normalization=True,
    edge_size_normalization_max=3,

    many_body_force_strength=-3000,
    zoom_factor=0.3,
)
fig

# Export the schema visualization
# The HTML file is written to results_dir; an existing file is overwritten.
schema_filepath = os.path.join(results_dir, f"{project_name}_schema.html")
fig.export_html(schema_filepath, overwrite=True)

%%time

# Convert every row of the node table into a tuple of
# (node id, node type, dictionary of node properties).
nodes = []
for row in df_nodes.itertuples():
    nodes.append((
        row.id,
        row.category,
        {
            # Caution: The attribute "name" is reserved in igraph as unique identifier of a node, therefore using "label"
            "label": row.name,
            "description": row.description,
            "xref": row.xref,
            "provided_by": row.provided_by,
            "synonym": row.synonym,
            "full_name": row.full_name,
            "in_taxon": row.in_taxon,
            "in_taxon_label": row.in_taxon_label,
            "symbol": row.symbol,
            "deprecated": row.deprecated,
            "iri": row.iri,
            "same_as": row.same_as,
        },
    ))

CPU times: user 26.7 s, sys: 2.48 s, total: 29.1 s
Wall time: 29.1 s

%%time

# Convert every row of the edge table into a tuple of
# (source id, target id, edge type, dictionary of edge properties).
edges = []
for row in df_edges.itertuples():
    edges.append((
        row.subject,
        row.object,
        row.predicate,
        {
            "id": row.id,
            "original_subject": row.original_subject,
            "original_object": row.original_object,
            "category": row.category,
            "agent_type": row.agent_type,
            "aggregator_knowledge_source": row.aggregator_knowledge_source,
            "knowledge_level": row.knowledge_level,
            "primary_knowledge_source": row.primary_knowledge_source,
            "qualifiers": row.qualifiers,
            "provided_by": row.provided_by,
            "has_evidence": row.has_evidence,
            "publications": row.publications,
            "stage_qualifier": row.stage_qualifier,
            "frequency_qualifier": row.frequency_qualifier,
            "has_count": row.has_count,
            "has_percentage": row.has_percentage,
            "has_quotient": row.has_quotient,
            "has_total": row.has_total,
            "negated": row.negated,
            "onset_qualifier": row.onset_qualifier,
            "sex_qualifier": row.sex_qualifier,
        },
    ))

CPU times: user 5min 36s, sys: 25.4 s, total: 6min 1s
Wall time: 6min 1s

%%time

# Export all node tuples as CSV via the shared helper
# (presumably returns the path of the written file - confirm in shared_bmkg).
nodes_csv_filepath = shared_bmkg.export_nodes_as_csv(nodes, results_dir, project_name)

CPU times: user 43.3 s, sys: 1.66 s, total: 44.9 s
Wall time: 45 s

%%time

# Export all edge tuples as CSV via the shared helper
# (presumably returns the path of the written file - confirm in shared_bmkg).
edges_csv_filepath = shared_bmkg.export_edges_as_csv(edges, results_dir, project_name)

CPU times: user 12min 13s, sys: 33.6 s, total: 12min 46s
Wall time: 12min 48s

%%time

# Build the full knowledge graph from the node and edge tuples. The result is
# used as an igraph-style graph below (vcount/ecount).
g = shared_bmkg.create_graph(nodes, edges)

CPU times: user 2min 58s, sys: 20 s, total: 3min 18s
Wall time: 3min 17s

# Print summary statistics of the full graph
shared_bmkg.report_graph_stats(g)

Directed multigraph with 1028155 nodes, 11076689 edges and a density of 1.048e-05.

# Correctness checks
# Each check raises an AssertionError with both numbers in the message if the
# reconstructed graph does not match the totals of the raw tables.

# 1) Does the reconstructed graph contain the same number of nodes as the raw data?
num_nodes_in_graph = g.vcount()
assert num_nodes_in_graph == num_nodes, f"Node counts differ: {num_nodes_in_graph} != {num_nodes}"
print(f"{num_nodes_in_graph:,} = {num_nodes:,}")

# 2) Does the reconstructed graph contain the same number of (unique) edges as the raw data?
num_edges_in_graph = g.ecount()
# The message reports num_edges, the value that is actually compared. It previously
# referenced an undefined name, which would have raised a NameError on failure.
assert num_edges_in_graph == num_edges, f"Edge counts differ: {num_edges_in_graph} != {num_edges}"
print(f"{num_edges_in_graph:,} = {num_edges:,}")

1,028,155 = 1,028,155
11,076,689 = 11,076,689

%%time

# Export the full graph as GraphML via the shared helper
# (presumably returns the path of the written file - confirm in shared_bmkg).
g_graphml_filepath = shared_bmkg.export_graph_as_graphml(g, results_dir, project_name)

CPU times: user 4min 27s, sys: 25.4 s, total: 4min 52s
Wall time: 4min 50s

# Drug: Imatinib - seems not to be contained in Monarch
# The substring search on the node attribute "label" only finds pathway nodes
# that mention the drug, no node for the drug itself.
shared_bmkg.list_nodes_matching_substring(g, "imatinib", "label")

id                        type               label                               
=================================================================================
Reactome:R-HSA-9669917    biolink:Pathway    Imatinib-resistant KIT mutants      
Reactome:R-HSA-9674396    biolink:Pathway    Imatinib-resistant PDGFR mutants

# Gene: ABL1
# Substring search on the node attribute "label"; besides genes it also finds
# genotypes and diseases whose label contains the gene symbol.
shared_bmkg.list_nodes_matching_substring(g, "abl1", "label")

id                          type                label                                                                                                                                                                                                                                        
=============================================================================================================================================================================================================================================================================================
HGNC:76                     biolink:Gene        ABL1                                                                                                                                                                                                                                         
MGI:2653892                 biolink:Genotype    Abl1<sup>tm1Mlg</sup>/Abl1<sup>tm1Mlg</sup> Abl2<sup>tm1Ajk</sup>/Abl2<sup>+</sup>  [background:] involves: 129S/SvEv * 129S4/SvJae * C57BL/6J                                                                                               
MGI:2653894                 biolink:Genotype    Abl1<sup>tm1Mlg</sup>/Abl1<sup>+</sup> Abl2<sup>tm1Ajk</sup>/Abl2<sup>tm1Ajk</sup>  [background:] involves: 129S/SvEv * 129S4/SvJae * C57BL/6J                                                                                               
MGI:2653897                 biolink:Genotype    Abl1<sup>tm1Mlg</sup>/Abl1<sup>tm1Mlg</sup> Abl2<sup>tm1Ajk</sup>/Abl2<sup>tm1Ajk</sup>  [background:] involves: 129S/SvEv * 129S4/SvJae * C57BL/6J                                                                                          
MGI:2665033                 biolink:Genotype    Tg(Igh-Abl1)40Sco/0  [background:] involves: C57BL/6JWehi * SJL/JWehi                                                                                                                                                                        
MGI:2665034                 biolink:Genotype    Tg(Igh-Abl1)40Sco/0 Tg(IghMyc)22Bri/0  [background:] involves: C57BL/6 * C57BL/6JWehi * SJL/J * SJL/JWehi                                                                                                                                    
MGI:3574537                 biolink:Genotype    Dok1<sup>tm1Yyam</sup>/Dok1<sup>tm1Yyam</sup> Tg(Tec-BCR/ABL1)5Hhi/0  [background:] involves: C57BL/6                                                                                                                                        
MGI:3574538                 biolink:Genotype    Dok1<sup>tm1Yyam</sup>/Dok1<sup>tm1Yyam</sup> Dok2<sup>tm1Yyam</sup>/Dok2<sup>tm1Yyam</sup> Tg(Tec-BCR/ABL1)5Hhi/0  [background:] involves: C57BL/6                                                                                          
MGI:3574539                 biolink:Genotype    Dok2<sup>tm1Yyam</sup>/Dok2<sup>tm1Yyam</sup> Tg(Tec-BCR/ABL1)5Hhi/0  [background:] involves: C57BL/6                                                                                                                                        
MGI:3574541                 biolink:Genotype    Dok1<sup>tm1Ppp</sup>/Dok1<sup>tm1Ppp</sup> Tg(Tec-BCR/ABL1)5Hhi/0  [background:] involves: 129S1/Sv                                                                                                                                         
MGI:3574544                 biolink:Genotype    Dok2<sup>tm1Ppp</sup>/Dok2<sup>tm1Ppp</sup> Tg(Tec-BCR/ABL1)5Hhi/0  [background:] involves: 129S1/Sv                                                                                                                                         
MGI:3574545                 biolink:Genotype    Dok1<sup>tm1Ppp</sup>/Dok1<sup>+</sup> Tg(Tec-BCR/ABL1)5Hhi/0  [background:] involves: 129S1/Sv                                                                                                                                              
MGI:3574546                 biolink:Genotype    Dok2<sup>tm1Ppp</sup>/Dok2<sup>+</sup> Tg(Tec-BCR/ABL1)5Hhi/0  [background:] involves: 129S1/Sv                                                                                                                                              
MGI:3583883                 biolink:Genotype    Abl1<sup>tm1Ajk</sup>/Abl1<sup>tm1Ajk</sup> Abl2<sup>tm1Ajk</sup>/Abl2<sup>tm1Ajk</sup> Tg(Nes-cre)1Kag/?  [background:] involves: 129S4/SvJae                                                                                               
MGI:3693360                 biolink:Genotype    Tg(tetO-BCR/ABL1)27Dgt/0 Tg(MMTVtTA)1Mam/0  [background:] involves: C57BL/6 * FVB/N * SJL                                                                                                                                                    
MGI:3693361                 biolink:Genotype    Tg(tetO-BCR/ABL1)2Dgt/0 Tg(MMTVtTA)1Mam/0  [background:] involves: C57BL/6 * FVB/N * SJL                                                                                                                                                     
MGI:3693373                 biolink:Genotype    Tg(tetO-BCR/ABL1)2Dgt/0 Tg(Tal1-tTA)19Dgt/0  [background:] involves: C57BL/6 * DBA/2 * FVB/N                                                                                                                                                 
MGI:3828503                 biolink:Genotype    Abl1<sup>tm1Mlg</sup>/Abl1<sup>tm1Mlg</sup>  [background:] involves: 129S/SvEv * C57BL/6J                                                                                                                                                    
MGI:3834480                 biolink:Genotype    Tg(Ly6a-BCR/ABL1)IS1AIsg/0 Tg(Ly6a-BCR/ABL1)IS1BIsg/0  [background:] involves: C57BL/6J * CBA                                                                                                                                                
MGI:3834481                 biolink:Genotype    Tg(Ly6a-BCR/ABL1)IS1AIsg/0  [background:] involves: C57BL/6J * CBA                                                                                                                                                                           
MGI:3834482                 biolink:Genotype    Tg(Ly6a-BCR/ABL1)IS1BIsg/0  [background:] involves: C57BL/6J * CBA                                                                                                                                                                           
MGI:3834483                 biolink:Genotype    Tg(Ly6a-TK,-BCR/ABL1)IS9AIsg/0  [background:] involves: C57BL/6J * CBA                                                                                                                                                                       
MGI:4354766                 biolink:Genotype    Cbl<sup>tm1Soga</sup>/Cbl<sup>tm1Soga</sup> Tg(Tec-BCR/ABL1)5Hhi/0  [background:] involves: C57BL/6 * DBA/2                                                                                                                                  
MGI:4354767                 biolink:Genotype    Cbl<sup>tm1Soga</sup>/Cbl<sup>+</sup> Tg(Tec-BCR/ABL1)5Hhi/0  [background:] involves: C57BL/6 * DBA/2                                                                                                                                        
MGI:4357933                 biolink:Genotype    Abl1<sup>m1</sup>/Abl1<sup>+</sup>  [background:] involves: 129S/SvEv                                                                                                                                                                        
MGI:4357934                 biolink:Genotype    Abl1<sup>m1</sup>/Abl1<sup>m1</sup>  [background:] either: 129S/SvEv-Abl1<sup>m1</sup> or (involves: 129S/SvEv * CD-1) or (involves: 129S/SvEv * C57BL/6 * DBA/2)                                                                            
MGI:4357935                 biolink:Genotype    Abl1<sup>m1</sup>/Abl1<sup>m1</sup>  [background:] involves: 129S/SvEv                                                                                                                                                                       
MGI:4357938                 biolink:Genotype    Abl1<sup>m1</sup>/Abl1<sup>m1</sup>  [background:] involves: 129S/SvEv * C57BL/6                                                                                                                                                             
MGI:4361586                 biolink:Genotype    Abl1<sup>tm1Mlg</sup>/Abl1<sup>tm1Mlg</sup>  [background:] involves: 129S/SvEv * C57BL/6J * CBA                                                                                                                                              
MGI:4361587                 biolink:Genotype    Abl1<sup>tm1Mlg</sup>/Abl1<sup>tm1Mlg</sup> Tg(ACTB-Abl1*I)1Spg/0  [background:] involves: 129S/SvEv * C57BL/6J * CBA                                                                                                                        
MGI:4361588                 biolink:Genotype    Abl1<sup>tm1Mlg</sup>/Abl1<sup>tm1Mlg</sup> Tg(ACTB-Abl1*K290R)1Spg/0  [background:] involves: 129S/SvEv * C57BL/6J * CBA                                                                                                                    
MGI:4361589                 biolink:Genotype    Abl1<sup>tm1Mlg</sup>/Abl1<sup>tm1Mlg</sup> Tg(ACTB-Abl1*IV)1Spg/0  [background:] involves: 129S/SvEv * C57BL/6J * CBA                                                                                                                       
MGI:4361590                 biolink:Genotype    Abl1<sup>tm1Mlg</sup>/Abl1<sup>tm1Mlg</sup> Tg(ACTB-Abl1*I)1Spg/0 Tg(ACTB-Abl1*IV)1Spg/0  [background:] involves: 129S/SvEv * C57BL/6J * CBA                                                                                                 
MGI:4421538                 biolink:Genotype    Abl1<sup>tm1Goff</sup>/Abl1<sup>tm1Goff</sup>  [background:] involves: C57BL/6                                                                                                                                                               
MGI:4421539                 biolink:Genotype    Abl1<sup>tm1Goff</sup>/Abl1<sup>tm1Goff</sup> Tg(Myh6-cre)2182Mds/0  [background:] involves: C57BL/6                                                                                                                                         
MGI:4421541                 biolink:Genotype    Abl1<sup>tm1.1Goff</sup>/Abl1<sup>tm1.1Goff</sup>  [background:] involves: C57BL/6                                                                                                                                                           
MGI:4421542                 biolink:Genotype    Abl1<sup>tm1Mlg</sup>/Abl1<sup>tm1Mlg</sup>  [background:] 129S/SvEv-Abl1<sup>tm1Mlg</sup>                                                                                                                                                   
MGI:4421543                 biolink:Genotype    Abl1<sup>tm1Mlg</sup>/Abl1<sup>tm1Mlg</sup>  [background:] B6.129-Abl1<sup>tm1Mlg</sup>                                                                                                                                                      
MGI:4421544                 biolink:Genotype    Abl1<sup>m1</sup>/Abl1<sup>m1</sup>  [background:] B6.129-Abl1<sup>m1</sup>                                                                                                                                                                  
MGI:4850043                 biolink:Genotype    Abl1<sup>tm2.2Goff</sup>/Abl1<sup>tm2.2Goff</sup>  [background:] involves: 129P2/OlaHsd * C57BL/6J * FVB/N                                                                                                                                   
MGI:4850044                 biolink:Genotype    Abl1<sup>tm2.1Goff</sup>/Abl1<sup>tm2.1Goff</sup> Abl2<sup>tm1Ajk</sup>/Abl2<sup>tm1Ajk</sup> Tg(Nes-cre)1Kln/0  [background:] involves: 129 * C57BL/6 * C57BL/6J * SJL                                                                      
MGI:4850045                 biolink:Genotype    Abl1<sup>tm2.1Goff</sup>/Abl1<sup>tm2.1Goff</sup> Abl2<sup>tm1Ajk</sup>/Abl2<sup>tm1Ajk</sup> Tg(Atoh1-cre)1Bfri/0  [background:] involves: 129P2/OlaHsd * 129S4/SvJae * C57BL/6J * CBA                                                      
MGI:4939891                 biolink:Genotype    Abl1<sup>tm1Gcos</sup>/Abl1<sup>+</sup> Myf5<sup>tm2Tajb</sup>/Myf5<sup>tm2Tajb</sup> Myf6<sup>tm1Tajb</sup>/Myf6<sup>tm1Tajb</sup> Myod1<sup>tm2.1(icre)Glh</sup>/Myod1<sup>+</sup>  [background:] involves: 129S * 129X1/SvJ * C57BL/6J    
MGI:4939892                 biolink:Genotype    Abl1<sup>tm1.1Gcos</sup>/Abl1<sup>tm1.1Gcos</sup>  [background:] involves: 129S1/Sv * 129X1/SvJ                                                                                                                                              
MGI:4939893                 biolink:Genotype    Abl1<sup>tm1.1Gcos</sup>/Abl1<sup>+</sup>  [background:] involves: 129S1/Sv * 129X1/SvJ                                                                                                                                                      
MGI:5806781                 biolink:Genotype    Gt(ROSA)26Sor<sup>tm4(CAG-hsb5)Nki</sup>/Gt(ROSA)26Sor<sup>+</sup> Tg(Mx1-cre)1Cgn/0 Tg(Tal1-tTA)19Dgt/0 Tg(tetO-BCR/ABL1)2Dgt/0 TgTn(pb-sb-GrOnc)#aGsva/0  [background:] involves: 129P2/OlaHsd * C57BL/6 * CBA/J * DBA/2 * FVB/N           
MGI:5806784                 biolink:Genotype    Tg(Mx1-cre)1Cgn/0 Tg(Tal1-tTA)19Dgt/0 Tg(tetO-BCR/ABL1)2Dgt/0 TgTn(pb-sb-GrOnc)#aGsva/0  [background:] involves: C57BL/6 * CBA/J * DBA/2 * FVB/N                                                                                             
MGI:5806786                 biolink:Genotype    Gt(ROSA)26Sor<sup>tm4(CAG-hsb5)Nki</sup>/Gt(ROSA)26Sor<sup>+</sup> Tg(Mx1-cre)1Cgn/0 Tg(tetO-BCR/ABL1)2Dgt/0 TgTn(pb-sb-GrOnc)#aGsva/0  [background:] involves: 129P2/OlaHsd * C57BL/6 * CBA/J * FVB/N                                       
MGI:87859                   biolink:Gene        Abl1                                                                                                                                                                                                                                         
MONDO:0004653               biolink:Disease     atypical chronic myeloid leukemia, BCR-ABL1 negative                                                                                                                                                                                         
MONDO:0006115               biolink:Disease     blast phase chronic myelogenous leukemia, BCR-ABL1 positive                                                                                                                                                                                  
MONDO:0011996               biolink:Disease     chronic myelogenous leukemia, BCR-ABL1 positive                                                                                                                                                                                              
MONDO:0035112               biolink:Disease     acute myeloid leukemia with BCR-ABL1                                                                                                                                                                                                         
MONDO:0850161               biolink:Disease     B-lymphoblastic leukemia/lymphoma, BCR-ABL1–like                                                                                                                                                                                             
MONDO:0850449               biolink:Disease     mixed phenotype acute leukemia with BCR-ABL1                                                                                                                                                                                                 
NCBIGene:100524544          biolink:Gene        ABL1                                                                                                                                                                                                                                         
NCBIGene:417181             biolink:Gene        ABL1                                                                                                                                                                                                                                         
NCBIGene:491292             biolink:Gene        ABL1                                                                                                                                                                                                                                         
NCBIGene:540876             biolink:Gene        ABL1                                                                                                                                                                                                                                         
RGD:1584969                 biolink:Gene        Abl1                                                                                                                                                                                                                                         
Xenbase:XB-GENE-17344225    biolink:Gene        abl1.L                                                                                                                                                                                                                                       
Xenbase:XB-GENE-6054675     biolink:Gene        abl1                                                                                                                                                                                                                                         
Xenbase:XB-GENE-6487639     biolink:Gene        abl1.S                                                                                                                                                                                                                                       
ZFIN:ZDB-GENE-100812-9      biolink:Gene        abl1

# Disease: Chronic Myeloid Leukemia (CML)
# Searched under the spelling "chronic myelogenous leukemia" in the node attribute "label".
shared_bmkg.list_nodes_matching_substring(g, "chronic myelogenous leukemia", "label")

id               type                         label                                                          
=============================================================================================================
HP:0005506       biolink:PhenotypicFeature    Chronic myelogenous leukemia                                   
MONDO:0006115    biolink:Disease              blast phase chronic myelogenous leukemia, BCR-ABL1 positive    
MONDO:0011996    biolink:Disease              chronic myelogenous leukemia, BCR-ABL1 positive

# Neighborhood of gene ABL1 - in Homo sapiens
# Extract the subgraph around the node with id "HGNC:76" from the full graph g.
# NOTE(review): presumably the source node plus its direct neighbors - confirm
# the exact neighborhood definition in shared_bmkg.get_egocentric_subgraph.
source = "HGNC:76"
subgraph = shared_bmkg.get_egocentric_subgraph(g, source)

# Export
# All three exports receive the same results_dir and filename stem.
# The CSV exports are additionally given the full `nodes` / `edges` collections
# together with the subgraph (presumably to restrict them to the subgraph's
# members - verify against the helper implementations).
filename = f"{project_name}_neighbors_abl1"
shared_bmkg.export_graph_as_graphml(subgraph, results_dir, filename)
shared_bmkg.export_nodes_as_csv(nodes, results_dir, filename, subgraph)
shared_bmkg.export_edges_as_csv(edges, results_dir, filename, subgraph)

# Report
# NOTE(review): unlike the CML neighborhood and the path cell, this cell does
# not call visualize_graph - presumably because this neighborhood is large
# (the recorded run reports 528 nodes and 40417 edges); confirm this is intended.
shared_bmkg.report_graph_stats(subgraph)

Directed multigraph with 528 nodes, 40417 edges and a density of 0.145.

# Neighborhood of disease CML
# "MONDO:0011996" is the id listed by the label search for
# "chronic myelogenous leukemia, BCR-ABL1 positive".
# NOTE(review): presumably the source node plus its direct neighbors - confirm
# the exact neighborhood definition in shared_bmkg.get_egocentric_subgraph.
source = "MONDO:0011996"
subgraph = shared_bmkg.get_egocentric_subgraph(g, source)

# Export
# All three exports receive the same results_dir and filename stem.
# The CSV exports are additionally given the full `nodes` / `edges` collections
# together with the subgraph (presumably to restrict them to the subgraph's
# members - verify against the helper implementations).
filename = f"{project_name}_neighbors_cml"
shared_bmkg.export_graph_as_graphml(subgraph, results_dir, filename)
shared_bmkg.export_nodes_as_csv(nodes, results_dir, filename, subgraph)
shared_bmkg.export_edges_as_csv(edges, results_dir, filename, subgraph)

# Report
# Print summary statistics, then render the subgraph; the source id is passed
# to the visualization (presumably so the chosen node can be highlighted - confirm).
shared_bmkg.report_graph_stats(subgraph)
shared_bmkg.visualize_graph(subgraph, node_type_to_color, source)

Directed multigraph with 44 nodes, 106 edges and a density of 0.05475.

# Paths from gene ABL1 to disease "myeloid leukemia"
# The source is the same gene id used for the ABL1 neighborhood ("HGNC:76").
# The target "MONDO:0004643" is a different node than the CML node
# ("MONDO:0011996") explored in the neighborhood cell, hence "ML" in the filename.
# NOTE(review): presumably get_paths_subgraph collects shortest paths between
# source and target - confirm in shared_bmkg.
source = "HGNC:76"
target = "MONDO:0004643"
subgraph = shared_bmkg.get_paths_subgraph(g, source, target)

# Export
# All three exports receive the same results_dir and filename stem.
# The CSV exports are additionally given the full `nodes` / `edges` collections
# together with the subgraph (presumably to restrict them to the subgraph's
# members - verify against the helper implementations).
filename = f"{project_name}_paths_abl1_to_ML"
shared_bmkg.export_graph_as_graphml(subgraph, results_dir, filename)
shared_bmkg.export_nodes_as_csv(nodes, results_dir, filename, subgraph)
shared_bmkg.export_edges_as_csv(edges, results_dir, filename, subgraph)

# Report
# Print summary statistics, then render the subgraph; both endpoint ids are
# passed to the visualization (presumably so they can be highlighted - confirm).
shared_bmkg.report_graph_stats(subgraph)
shared_bmkg.visualize_graph(subgraph, node_type_to_color, source, target)

Directed multigraph with 3 nodes, 2 edges and a density of 0.2222.

	id	category	name	xref	has_gene	in_taxon	in_taxon_label	provided_by	description	synonym	full_name	symbol	type	deprecated	iri	same_as
0	CLINVAR:586	biolink:SequenceVariant	NM_000277.2(PAH):c.1A>G (p.Met1Val)	CA114360	HGNC:8582	NCBITaxon:9606	Homo sapiens	clingen_variant_nodes	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1	CLINVAR:102844	biolink:SequenceVariant	NM_000277.2(PAH):c.806delT (p.Ile269Thrfs)	CA229778	HGNC:8582	NCBITaxon:9606	Homo sapiens	clingen_variant_nodes	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN

	id	category	name	xref	has_gene	in_taxon	in_taxon_label	provided_by	description	synonym	full_name	symbol	type	deprecated	iri	same_as
450246	MGI:7608104	biolink:Genotype	Crlf3<sup>em1Gtm</sup>/Crlf3<sup>+</sup> [bac...	NaN	NaN	NCBITaxon:10090	Mus musculus	alliance_genotype_nodes	NaN	NaN	NaN	NaN	genotype	NaN	NaN	NaN
450247	MGI:7608107	biolink:Genotype	Cdc23<sup>em1Lwa</sup>/Cdc23<sup>em1Lwa</sup> ...	NaN	NaN	NCBITaxon:10090	Mus musculus	alliance_genotype_nodes	NaN	NaN	NaN	NaN	genotype	NaN	NaN	NaN

	id	original_subject	predicate	original_object	category	agent_type	aggregator_knowledge_source	knowledge_level	primary_knowledge_source	publications	...	frequency_qualifier	has_count	has_percentage	has_quotient	has_total	negated	onset_qualifier	sex_qualifier	subject	object
0	3dfcb65a-26a2-11ef-ace6-e9678ebf82fc	NaN	biolink:has_phenotype	NaN	biolink:GenotypeToPhenotypicFeatureAssociation	manual_agent	infores:monarchinitiative	knowledge_assertion	infores:zfin	ZFIN:ZDB-PUB-060503-2	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	ZFIN:ZDB-FISH-150901-1	ZP:0000041
1	3dfcb65b-26a2-11ef-ace6-e9678ebf82fc	NaN	biolink:has_phenotype	NaN	biolink:GenotypeToPhenotypicFeatureAssociation	manual_agent	infores:monarchinitiative	knowledge_assertion	infores:zfin	ZFIN:ZDB-PUB-060503-2	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	ZFIN:ZDB-FISH-150901-1	ZP:0000055

	id	original_subject	predicate	original_object	category	agent_type	aggregator_knowledge_source	knowledge_level	primary_knowledge_source	publications	...	frequency_qualifier	has_count	has_percentage	has_quotient	has_total	negated	onset_qualifier	sex_qualifier	subject	object
229025	uuid:3324568f-4007-11ef-89e7-6fe0be41fbbf	MESH:C426686	biolink:treats_or_applied_or_studied_to_treat	MESH:D020521	biolink:ChemicalToDiseaseOrPhenotypicFeatureAs...	manual_agent	infores:monarchinitiative	knowledge_assertion	infores:ctd	PMID:16305522\|PMID:17885258	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	CHEBI:65172	MONDO:0005098
229026	uuid:33245690-4007-11ef-89e7-6fe0be41fbbf	MESH:C426686	biolink:treats_or_applied_or_studied_to_treat	MESH:D054556	biolink:ChemicalToDiseaseOrPhenotypicFeatureAs...	manual_agent	infores:monarchinitiative	knowledge_assertion	infores:ctd	PMID:16123915\|PMID:16305522	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	CHEBI:65172	MONDO:0005399

Monarch

Table of contents

1. Setup

a) Import packages

b) Create data directories

2. Data download

All files provided by the project

Files needed to create the knowledge graph

3. Data import

4. Data inspection

a) Number of nodes and edges

b) Types of nodes and edges

c) Example entries

Nodes together with node annotations

Edges together with edge annotations

5. Schema discovery

6. Knowledge graph reconstruction

a) Convert the data into a standardized format

Nodes

Edges

b) Export the standardized data to two CSV files

c) Use the standardized data to build a graph

d) Export the graph to a GraphML file

7. Subgraph exploration

a) Search for interesting nodes

b) Explore the neighborhood of a chosen node

c) Find shortest paths between two chosen nodes