Downstream analysis example
The following provides an example of how a knowledge graph that was downloaded and converted with the package kgw can be used in a downstream analysis. Similar and more extensive analyses can be found in the notebooks section of the repository awesome-biomedical-knowledge-graphs, which informed the design and implementation of kgw.
This notebook explores the HALD knowledge graph after it has been converted with kgw to 1) a file-based SQLite database (kg.sqlite) and 2) a MeTTa file for OpenCog Hyperon (kg_spo.metta). The latter contains the KG in a maximally concise semantic triples representation (subject, predicate, object), i.e. without any node and edge properties. In contrast, the SQLite database and other MeTTa representations capture the full KG including all properties.
Please note that the demonstration provided here is only a tiny sample of all possible queries that can be formulated against the KG in the languages SQL for SQLite and MeTTa for OpenCog Hyperon. Future demonstrations of downstream analyses may cover more complex tasks and answer more interesting questions.
Import packages
[1]:
import ast
import os
import sqlite3

import gravis as gv
import hyperon
import networkx as nx
Set filepaths
[2]:
# Locations of the two converted HALD files used in this notebook; both paths
# are relative to the current working directory.
filepath_sqlite = os.path.join("hald_v6", "results", "kg.sqlite")
filepath_metta = os.path.join("hald_v6", "results", "kg_spo.metta")
Define some helper functions
SQLite
[3]:
def load_sqlite_kg(filepath):
    """Open the SQLite knowledge graph stored at ``filepath``.

    Args:
        filepath: Location of the SQLite database file.

    Returns:
        An open ``sqlite3.Connection``. This function does not close it.
    """
    return sqlite3.connect(filepath)
def query_sqlite_kg(conn, query, params=()):
    """Run one SQL statement against the knowledge graph and return all rows.

    Args:
        conn: Open ``sqlite3.Connection``.
        query: SQL text, optionally containing ``?`` or ``:name`` placeholders.
        params: Values bound to the placeholders (a sequence for ``?``, a
            mapping for ``:name``). Defaults to no parameters, so existing
            two-argument calls behave as before.

    Returns:
        List of result rows as returned by ``cursor.fetchall()``.
    """
    cursor = conn.cursor()
    try:
        cursor.execute(query, params)
        return cursor.fetchall()
    finally:
        # Release the cursor even when the statement fails.
        cursor.close()
OpenCog Hyperon
[4]:
def load_metta_kg(filepath):
    """Import a ``.metta`` file into a fresh MeTTa runner as ``&kg``.

    Args:
        filepath: Path of the MeTTa file, e.g. one built with ``os.path.join``.

    Returns:
        The ``hyperon.MeTTa`` runner on which the import was executed.
    """
    # Strip only a trailing ".metta" extension, not every occurrence in the path.
    root, ext = os.path.splitext(filepath)
    if ext != ".metta":
        root = filepath
    # The module name uses ":" between path components. Translate the platform
    # separator as well as "/", because os.path.join yields "\" on Windows.
    module = root.replace(os.sep, ":").replace("/", ":")
    metta = hyperon.MeTTa()
    metta.run(f"!(import! &kg {module})")
    return metta
Load the knowledge graph
SQLite
[5]:
# Open the SQLite database; this connection is reused by the SQL queries below.
sqlite_conn = load_sqlite_kg(filepath_sqlite)
OpenCog Hyperon
[6]:
# Import the triples file into a MeTTa runner; this runner is reused by the
# OpenCog Hyperon queries below.
metta = load_metta_kg(filepath_metta)
Query the knowledge graph
1) Number of nodes and edges
SQLite
[7]:
# Total number of rows in the nodes table.
sql_query = "SELECT COUNT(*) FROM nodes"
query_sqlite_kg(sqlite_conn, sql_query)
[7]:
[(12257,)]
[8]:
# Total number of rows in the edges table.
sql_query = "SELECT COUNT(*) FROM edges"
query_sqlite_kg(sqlite_conn, sql_query)
[8]:
[(116495,)]
[9]:
# Both counts in a single statement, using two scalar subqueries.
sql_query = "SELECT (SELECT COUNT(*) FROM nodes), (SELECT COUNT(*) FROM edges)"
query_sqlite_kg(sqlite_conn, sql_query)
[9]:
[(12257, 116495)]
OpenCog Hyperon
[10]:
def define_count_function(metta):
    """Register a recursive ``count`` function in the given MeTTa runner.

    The MeTTa definition yields 0 for the empty expression and otherwise adds
    1 to the count of the expression's tail (``cdr-atom``).

    Args:
        metta: Runner object exposing ``run``; the definition is sent to it.
    """
    definition_lines = (
        "",
        "(= (count $x)",
        "(if (== $x ())",
        "0",
        "(+ (count (cdr-atom $x)) 1)))",
        "",
    )
    metta.run("\n".join(definition_lines))
define_count_function(metta)
[11]:
# Count the collapsed results of matching (: $x NodeType) in the &kg space.
# NOTE(review): kg_spo.metta is described above as plain triples without
# properties - confirm that such type declarations exist in the loaded file.
metta_query = "!(count (collapse (match &kg (, $x (: $x NodeType)) $x)))"
#metta.run(metta_query) # slow at the moment
[12]:
# Count the collapsed results of matching (: $x EdgeType) in the &kg space.
# NOTE(review): kg_spo.metta is described above as plain triples without
# properties - confirm that such type declarations exist in the loaded file.
metta_query = "!(count (collapse (match &kg (, $x (: $x EdgeType)) $x)))"
#metta.run(metta_query) # slow at the moment
2) Types of nodes and edges
SQLite
[13]:
# Node types with their number of nodes, most frequent type first.
sql_query = "SELECT type, COUNT(*) as cnt FROM nodes GROUP BY type ORDER BY cnt DESC;"
query_sqlite_kg(sqlite_conn, sql_query)
[13]:
[('Gene', 5624),
('Disease', 3501),
('Mutation', 2217),
('RNA', 388),
('Carbohydrate', 211),
('Lipid', 177),
('Peptide', 82),
('Protein', 29),
('Pharmaceutical Preparations', 15),
('Toxin', 13)]
[14]:
# Edge types with their number of edges, most frequent type first.
# Note: Limited to 20 most frequently occurring edge types because there are many in HALD
sql_query = "SELECT type, COUNT(*) as cnt FROM edges GROUP BY type ORDER BY cnt DESC LIMIT 20;"
query_sqlite_kg(sqlite_conn, sql_query)
[14]:
[('associated', 19110),
('include', 5542),
('increase', 2088),
('result', 2015),
('cause', 2006),
('lead', 1990),
('occur', 1600),
('characterized', 1468),
('develop', 1467),
('related', 1425),
('show', 1250),
('reduce', 1249),
('contribute', 1204),
('associate', 1204),
('caused', 1156),
('prevent', 966),
('compare', 964),
('found', 848),
('observed', 829),
('present', 761)]
OpenCog Hyperon
[15]:
# TBD
3) Neighborhood of a node
[16]:
def visualize_edges(edges, allow_multiedges=True):
    """Build a directed graph from triples and render it with gravis.

    Args:
        edges: Iterable of ``(subject, predicate, object)`` triples.
        allow_multiedges: If true, use ``nx.MultiDiGraph`` so that every
            triple becomes its own edge; otherwise use ``nx.DiGraph``.

    Returns:
        The figure object returned by ``gv.d3``.
    """
    graph_type = nx.MultiDiGraph if allow_multiedges else nx.DiGraph
    graph = graph_type()
    for subject, predicate, obj in edges:
        # The predicate is stored as the edge's "hover" attribute.
        graph.add_edge(subject, obj, hover=predicate)
    # Print the graph summary (graph type, number of nodes and edges).
    print(graph)
    return gv.d3(
        graph,
        node_hover_neighborhood=True,
        edge_curvature=0.1,
        many_body_force_strength=-2000,
    )
SQLite
[17]:
def get_sqlite_neighborhood(node_id):
    """Return the edges in the direct neighborhood of a node.

    The result contains every edge that touches ``node_id`` plus every edge
    whose two endpoints are both direct neighbors of ``node_id``.

    Args:
        node_id: Identifier compared against ``source_id`` and ``target_id``.

    Returns:
        List of ``(source_id, type, target_id)`` rows.
    """
    # node_id is bound as a parameter instead of being interpolated into the
    # SQL text: ids containing a double quote no longer break the statement,
    # and SQLite cannot mistake a double-quoted id such as "type" for a
    # column name.
    query = """
        SELECT
            source_id, type, target_id
        FROM
            edges
        WHERE
            source_id = :node_id
            OR
            target_id = :node_id
            OR (
                source_id IN (
                    SELECT target_id FROM edges WHERE source_id = :node_id
                    UNION
                    SELECT source_id FROM edges WHERE target_id = :node_id
                )
                AND target_id IN (
                    SELECT target_id FROM edges WHERE source_id = :node_id
                    UNION
                    SELECT source_id FROM edges WHERE target_id = :node_id
                )
            )
    """
    # Uses the notebook-level connection ``sqlite_conn``.
    cursor = sqlite_conn.cursor()
    try:
        cursor.execute(query, {"node_id": node_id})
        edges = cursor.fetchall()
    finally:
        cursor.close()
    print(f'Found {len(edges)} edges attached to node with id "{node_id}"')
    return edges
[18]:
# Fetch the neighborhood of one example node and plot it with the default
# allow_multiedges=True, i.e. as a MultiDiGraph.
edges_sqlite = get_sqlite_neighborhood("Leukemia, Myelomonocytic, Chronic")
visualize_edges(edges_sqlite)
Found 126 edges attached to node with id "Leukemia, Myelomonocytic, Chronic"
MultiDiGraph with 9 nodes and 126 edges
[18]:
[19]:
# Second example node; allow_multiedges=False plots a DiGraph, which keeps at
# most one edge per ordered (source, target) pair.
edges_sqlite = get_sqlite_neighborhood("Leukemia, Myelogenous, Chronic, BCR-ABL Positive")
visualize_edges(edges_sqlite, allow_multiedges=False)
Found 1535 edges attached to node with id "Leukemia, Myelogenous, Chronic, BCR-ABL Positive"
DiGraph with 29 nodes and 213 edges
[19]:
OpenCog Hyperon
[20]:
def define_neighborhood_functions(metta):
    """Register the MeTTa helper functions used to extract a node's neighborhood.

    Args:
        metta: Runner object exposing ``run``; each definition is sent to it
            in the order listed below.
    """
    definitions = (
        # Neighbor nodes: the other end of each triple the node appears in
        "(= (get-neighbor-nodes $s) (match &kg ($s $p $o) $o))",
        "(= (get-neighbor-nodes $o) (match &kg ($s $p $o) $s))",
        # The center node itself plus its neighbor nodes
        "(= (get-all-nodes $x) $x)",
        "(= (get-all-nodes $x) (get-neighbor-nodes $x))",
        # Triples that connect one given pair of nodes
        "(= (get-triples $s $o) (match &kg ($s $p $o) ($s , $p , $o)))",
        # Triples between all nodes of the entire neighborhood
        "(= (get-neighborhood $x) (get-triples (get-all-nodes $x) (get-all-nodes $x)))",
    )
    for definition in definitions:
        metta.run(definition)
# Must be run exactly once, otherwise each additional call duplicates the output
define_neighborhood_functions(metta)
[21]:
def get_metta_neighborhood(node_id):
    """Return the deduplicated neighborhood triples of a node via MeTTa.

    Args:
        node_id: Node identifier, inserted as a string literal into the
            ``get-neighborhood`` call.

    Returns:
        List of tuples parsed from the textual form of each result row, in
        first-seen order.

    Raises:
        ValueError: If a result row's text is not a plain Python literal.
    """
    # Uses the notebook-level runner ``metta``.
    raw_result = metta.run(f'!(get-neighborhood "{node_id}")')
    # Deduplicate on the textual form. Dict keys keep first-seen order, so the
    # result order is reproducible between runs (a set's order is not).
    unique_rows = dict.fromkeys(str(row) for row in raw_result[0])
    # Each row's text is parsed as a Python tuple literal. literal_eval only
    # accepts literals, so text originating from the KG file cannot execute
    # code the way eval could.
    edges = [ast.literal_eval(text) for text in unique_rows]
    print(f'Found {len(edges)} edges attached to node with id "{node_id}"')
    return edges
[22]:
%%time
edges_metta = get_metta_neighborhood("Leukemia, Myelomonocytic, Chronic")
visualize_edges(edges_metta)
Found 126 edges attached to node with id "Leukemia, Myelomonocytic, Chronic"
MultiDiGraph with 9 nodes and 126 edges
CPU times: user 42min 27s, sys: 5.17 s, total: 42min 32s
Wall time: 42min 34s
[22]: