Downstream analysis example

The following provides an example of how a knowledge graph that was downloaded and converted with the package kgw can be used in a downstream analysis. Similar and more extensive analyses can be found in the notebooks section of the repository awesome-biomedical-knowledge-graphs, which informed the design and implementation of kgw.

This notebook explores the HALD knowledge graph after it has been converted with kgw to 1) a file-based SQLite database (kg.sqlite) and 2) a MeTTa file for OpenCog Hyperon (kg_spo.metta). The latter contains the KG in a maximally concise semantic triples representation (subject, predicate, object), i.e. without any node and edge properties. In contrast, the SQLite database and other MeTTa representations capture the full KG including all properties.

Please note that the demonstration provided here is only a tiny sample of all possible queries that can be formulated against the KG in the languages SQL for SQLite and MeTTa for OpenCog Hyperon. Future demonstrations of downstream analyses may cover more complex tasks and answer more interesting questions.

Import packages

[1]:
import ast
import os
import sqlite3

import gravis as gv
import hyperon
import networkx as nx

Set filepaths

[2]:
# Both converted artifacts are read from the same results directory.
results_dir = os.path.join("hald_v6", "results")
filepath_sqlite = os.path.join(results_dir, "kg.sqlite")
filepath_metta = os.path.join(results_dir, "kg_spo.metta")

Define some helper functions

SQLite

[3]:
def load_sqlite_kg(filepath):
    """Open the SQLite database at ``filepath`` and return the connection.

    The connection is not closed here; it stays open for the caller to use.
    """
    return sqlite3.connect(filepath)

def query_sqlite_kg(conn, query, params=()):
    """Execute one SQL statement on ``conn`` and return all result rows.

    Args:
        conn: An open ``sqlite3.Connection``.
        query: SQL text. It may contain ``?`` or ``:name`` placeholders.
        params: Values bound to the placeholders, given as a sequence (for
            ``?``) or a mapping (for ``:name``). Defaults to no parameters,
            so existing two-argument calls keep working unchanged.

    Returns:
        A list with one tuple per result row, as produced by ``fetchall()``.
    """
    cursor = conn.cursor()
    try:
        # Binding values through ``params`` avoids building SQL via string
        # formatting, which breaks on ids that contain quote characters.
        cursor.execute(query, params)
        return cursor.fetchall()
    finally:
        # Release the cursor even if the statement raises.
        cursor.close()

OpenCog Hyperon

[4]:
def load_metta_kg(filepath):
    """Create a MeTTa interpreter and import the KG file into the ``&kg`` space.

    Args:
        filepath: Path to the ``.metta`` file. It is converted into the
            colon-separated module name that is passed to ``import!``.

    Returns:
        The ``hyperon.MeTTa`` instance on which the import was run.
    """
    metta = hyperon.MeTTa()
    # Strip only a trailing ".metta" extension; a plain str.replace would also
    # remove the substring from directory names earlier in the path.
    root, ext = os.path.splitext(filepath)
    if ext != ".metta":
        root = filepath
    # Paths built with os.path.join use os.sep (a backslash on Windows), so
    # translate both the native separator and "/" into ":".
    module = root.replace(os.sep, ":").replace("/", ":")
    metta.run(f"!(import! &kg {module})")
    return metta

Load the knowledge graph

SQLite

[5]:
# Connection object that the SQLite queries in this notebook are run against
sqlite_conn = load_sqlite_kg(filepath_sqlite)

OpenCog Hyperon

[6]:
# MeTTa interpreter with the KG imported into the &kg space
metta = load_metta_kg(filepath_metta)

Query the knowledge graph

1) Number of nodes and edges

SQLite

[7]:
# Number of rows in the nodes table
sql_query = "SELECT COUNT(*) FROM nodes"
query_sqlite_kg(sqlite_conn, sql_query)
[7]:
[(12257,)]
[8]:
# Number of rows in the edges table
sql_query = "SELECT COUNT(*) FROM edges"
query_sqlite_kg(sqlite_conn, sql_query)
[8]:
[(116495,)]
[9]:
# Both counts in one statement: each scalar subquery becomes one column of the single result row
sql_query = "SELECT (SELECT COUNT(*) FROM nodes), (SELECT COUNT(*) FROM edges)"
query_sqlite_kg(sqlite_conn, sql_query)
[9]:
[(12257, 116495)]

OpenCog Hyperon

[10]:
def define_count_function(metta):
    """Register a recursive ``count`` function in the given MeTTa interpreter.

    The MeTTa definition yields 0 for the empty expression ``()`` and
    otherwise 1 plus the count of the remainder obtained via ``cdr-atom``.
    """
    count_definition = """
    (= (count $x)
       (if (== $x ())
         0
         (+ (count (cdr-atom $x)) 1)))
    """
    metta.run(count_definition)

define_count_function(metta)
[11]:
# Collapse all matches for atoms typed as NodeType into one expression and count its elements.
# The query is only defined here; running it is disabled below because it is currently slow.
metta_query = "!(count (collapse (match &kg (, $x (: $x NodeType)) $x)))"
#metta.run(metta_query)  # slow at the moment
[12]:
# Collapse all matches for atoms typed as EdgeType into one expression and count its elements.
# The query is only defined here; running it is disabled below because it is currently slow.
metta_query = "!(count (collapse (match &kg (, $x (: $x EdgeType)) $x)))"
#metta.run(metta_query)  # slow at the moment

2) Types of nodes and edges

SQLite

[13]:
# Number of nodes per node type, most frequent type first
sql_query = "SELECT type, COUNT(*) as cnt FROM nodes GROUP BY type ORDER BY cnt DESC;"
query_sqlite_kg(sqlite_conn, sql_query)
[13]:
[('Gene', 5624),
 ('Disease', 3501),
 ('Mutation', 2217),
 ('RNA', 388),
 ('Carbohydrate', 211),
 ('Lipid', 177),
 ('Peptide', 82),
 ('Protein', 29),
 ('Pharmaceutical Preparations', 15),
 ('Toxin', 13)]
[14]:
# Number of edges per edge type, most frequent type first
# Note: Limited to 20 most frequently occurring edge types because there are many in HALD
sql_query = "SELECT type, COUNT(*) as cnt FROM edges GROUP BY type ORDER BY cnt DESC LIMIT 20;"
query_sqlite_kg(sqlite_conn, sql_query)
[14]:
[('associated', 19110),
 ('include', 5542),
 ('increase', 2088),
 ('result', 2015),
 ('cause', 2006),
 ('lead', 1990),
 ('occur', 1600),
 ('characterized', 1468),
 ('develop', 1467),
 ('related', 1425),
 ('show', 1250),
 ('reduce', 1249),
 ('contribute', 1204),
 ('associate', 1204),
 ('caused', 1156),
 ('prevent', 966),
 ('compare', 964),
 ('found', 848),
 ('observed', 829),
 ('present', 761)]

OpenCog Hyperon

[15]:
# TBD: MeTTa counterpart of the SQLite node type and edge type counts

3) Neighborhood of a node

[16]:
def visualize_edges(edges, allow_multiedges=True):
    """Build a directed graph from triples and return an interactive gravis plot.

    Args:
        edges: Iterable of ``(subject, predicate, object)`` triples. Each
            predicate is stored as the ``hover`` attribute of its edge.
        allow_multiedges: If true, a ``MultiDiGraph`` keeps every parallel
            edge. Otherwise a ``DiGraph`` is used, so at most one edge exists
            per ordered node pair.

    Returns:
        The figure object created by ``gv.d3``.
    """
    graph_type = nx.MultiDiGraph if allow_multiedges else nx.DiGraph
    graph = graph_type()
    for subject, predicate, obj in edges:
        graph.add_edge(subject, obj, hover=predicate)
    # Printing the graph shows its type together with node and edge counts.
    print(graph)
    plot_options = {
        "node_hover_neighborhood": True,
        "edge_curvature": 0.1,
        "many_body_force_strength": -2000,
    }
    return gv.d3(graph, **plot_options)

SQLite

[17]:
def get_sqlite_neighborhood(node_id, conn=None):
    """Fetch all edges in the direct neighborhood of a node from SQLite.

    The result contains every edge that has ``node_id`` as source or target,
    plus every edge whose source and target are both direct neighbors of
    ``node_id``.

    Args:
        node_id: Identifier of the center node, compared against the
            ``source_id`` and ``target_id`` columns of the ``edges`` table.
        conn: Optional ``sqlite3.Connection`` to query. When omitted, the
            notebook-level ``sqlite_conn`` is used.

    Returns:
        A list of ``(source_id, type, target_id)`` tuples.
    """
    if conn is None:
        conn = sqlite_conn
    # The id is passed as a bound parameter instead of being formatted into a
    # double-quoted SQL token. SQLite resolves double-quoted tokens as
    # identifiers first, so an id such as "type" would be compared as a
    # column, and an id containing a double quote would break the statement.
    query = """
    SELECT
        source_id, type, target_id
    FROM
        edges
    WHERE
        source_id = :node_id
    OR
        target_id = :node_id
    OR (
        source_id IN (
            SELECT target_id FROM edges WHERE source_id = :node_id
            UNION
            SELECT source_id FROM edges WHERE target_id = :node_id
        )
        AND target_id IN (
            SELECT target_id FROM edges WHERE source_id = :node_id
            UNION
            SELECT source_id FROM edges WHERE target_id = :node_id
        )
    )
    """
    edges = conn.execute(query, {"node_id": node_id}).fetchall()
    print(f'Found {len(edges)} edges attached to node with id "{node_id}"')
    return edges
[18]:
# Small neighborhood: drawn with all parallel edges kept (MultiDiGraph is the default)
edges_sqlite = get_sqlite_neighborhood("Leukemia, Myelomonocytic, Chronic")
visualize_edges(edges_sqlite)
Found 126 edges attached to node with id "Leukemia, Myelomonocytic, Chronic"
MultiDiGraph with 9 nodes and 126 edges
[18]:
Details for selected element
General
App state
Display mode
Export
Data selection
Graph
Node label text
Edge label text
Node size
Minimum
Maximum
Edge size
Minimum
Maximum
Nodes
Visibility
Size
Scaling factor
Position
Drag behavior
Hover behavior
Node images
Visibility
Size
Scaling factor
Node labels
Visibility
Size
Scaling factor
Rotation
Angle
Edges
Visibility
Size
Scaling factor
Form
Curvature
Hover behavior
Edge labels
Visibility
Size
Scaling factor
Rotation
Angle
Layout algorithm
Simulation
Many-body force
Strength
Theta
Min
Max
Links force
Collision force
Radius
Strength
x-positioning force
Strength
y-positioning force
Strength
Centering force
[19]:
# Larger neighborhood: parallel edges are merged (DiGraph) to keep the plot readable
edges_sqlite = get_sqlite_neighborhood("Leukemia, Myelogenous, Chronic, BCR-ABL Positive")
visualize_edges(edges_sqlite, allow_multiedges=False)
Found 1535 edges attached to node with id "Leukemia, Myelogenous, Chronic, BCR-ABL Positive"
DiGraph with 29 nodes and 213 edges
[19]:
Details for selected element
General
App state
Display mode
Export
Data selection
Graph
Node label text
Edge label text
Node size
Minimum
Maximum
Edge size
Minimum
Maximum
Nodes
Visibility
Size
Scaling factor
Position
Drag behavior
Hover behavior
Node images
Visibility
Size
Scaling factor
Node labels
Visibility
Size
Scaling factor
Rotation
Angle
Edges
Visibility
Size
Scaling factor
Form
Curvature
Hover behavior
Edge labels
Visibility
Size
Scaling factor
Rotation
Angle
Layout algorithm
Simulation
Many-body force
Strength
Theta
Min
Max
Links force
Collision force
Radius
Strength
x-positioning force
Strength
y-positioning force
Strength
Centering force

OpenCog Hyperon

[20]:
def define_neighborhood_functions(metta):
    """Register the MeTTa functions used to extract the neighborhood of a node.

    The definitions are passed to ``metta.run`` one by one in the order listed.
    """
    definitions = (
        # Get neighbor nodes
        "(= (get-neighbor-nodes $s) (match &kg ($s $p $o) $o))",
        "(= (get-neighbor-nodes $o) (match &kg ($s $p $o) $s))",
        # Get both the center node and the neighbor nodes
        "(= (get-all-nodes $x) $x)",
        "(= (get-all-nodes $x) (get-neighbor-nodes $x))",
        # Get one edge
        "(= (get-triples $s $o) (match &kg ($s $p $o) ($s , $p , $o)))",
        # Get edges between all nodes in the entire neighborhood
        "(= (get-neighborhood $x) (get-triples (get-all-nodes $x) (get-all-nodes $x)))",
    )
    for definition in definitions:
        metta.run(definition)

# Must be run exactly once, otherwise each additional call duplicates the output
define_neighborhood_functions(metta)
[21]:
def get_metta_neighborhood(node_id):
    """Fetch all edges in the direct neighborhood of a node via MeTTa.

    Runs ``get-neighborhood`` for ``node_id`` on the notebook-level ``metta``
    interpreter and converts the textual form of each distinct result into a
    Python tuple.

    Args:
        node_id: Identifier of the center node, inserted into the query as a
            double-quoted MeTTa string.

    Returns:
        A list of de-duplicated edge tuples in the order they were first
        returned by the interpreter.
    """
    raw_result = metta.run(f'!(get-neighborhood "{node_id}")')
    # dict.fromkeys drops duplicates like set() did, but keeps a stable order.
    unique_rows = dict.fromkeys(str(row) for row in raw_result[0])
    # literal_eval accepts only Python literals, so text originating from the
    # knowledge graph cannot execute arbitrary code the way eval() would.
    edges = [ast.literal_eval(text) for text in unique_rows]
    print(f'Found {len(edges)} edges attached to node with id "{node_id}"')
    return edges
[22]:
%%time

# Same center node as in the first SQLite neighborhood example, timed for comparison
edges_metta = get_metta_neighborhood("Leukemia, Myelomonocytic, Chronic")
visualize_edges(edges_metta)
Found 126 edges attached to node with id "Leukemia, Myelomonocytic, Chronic"
MultiDiGraph with 9 nodes and 126 edges
CPU times: user 42min 27s, sys: 5.17 s, total: 42min 32s
Wall time: 42min 34s
[22]:
Details for selected element
General
App state
Display mode
Export
Data selection
Graph
Node label text
Edge label text
Node size
Minimum
Maximum
Edge size
Minimum
Maximum
Nodes
Visibility
Size
Scaling factor
Position
Drag behavior
Hover behavior
Node images
Visibility
Size
Scaling factor
Node labels
Visibility
Size
Scaling factor
Rotation
Angle
Edges
Visibility
Size
Scaling factor
Form
Curvature
Hover behavior
Edge labels
Visibility
Size
Scaling factor
Rotation
Angle
Layout algorithm
Simulation
Many-body force
Strength
Theta
Min
Max
Links force
Collision force
Radius
Strength
x-positioning force
Strength
y-positioning force
Strength
Centering force