{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Bigrams\n",
    "\n",
    "This Jupyter notebook provides an example of using the Python package [gravis](https://pypi.org/project/gravis). The .ipynb file can be found [here](https://github.com/robert-haas/gravis/tree/master/examples).\n",
    "\n",
    "It uses the **Natural Language Toolkit (NLTK)** to extract **word bigrams** from a text and filter them by simple criteria to get a list of relevant ones. Each bigram is a pair of words; therefore, a list of bigrams can be interpreted as a directed graph: words as nodes, word pairs as edges, frequency of a word pair as edge width.\n",
    "\n",
    "## References\n",
    "\n",
    "- Wikipedia\n",
    "    - [n-Gram](https://en.wikipedia.org/wiki/N-gram)\n",
    "- NLTK\n",
    "    - [Accessing Text Corpora and Lexical Resources](https://www.nltk.org/book/ch02.html)\n",
    "    - [Collocations](https://www.nltk.org/howto/collocations.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter\n",
    "\n",
    "import gravis as gv\n",
    "import networkx as nx\n",
    "import nltk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download text corpora, if not already done before:\n",
    "# - 'gutenberg' provides the texts read below via nltk.corpus.gutenberg\n",
    "# - 'stopwords' provides the word lists read below via nltk.corpus.stopwords\n",
    "nltk.download('gutenberg')\n",
    "nltk.download('stopwords')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def text_to_bigrams_and_counts(text, min_count=3):\n",
    "    \"\"\"Count the word bigrams of a Gutenberg text and keep the relevant ones.\n",
    "\n",
    "    A bigram is kept if it occurs at least ``min_count`` times and each of its\n",
    "    two words is longer than one character, alphanumeric and not an English\n",
    "    stop word. Some statistics are printed along the way.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    text : str\n",
    "        File id of a text in NLTK's Gutenberg corpus, e.g. 'carroll-alice.txt'.\n",
    "    min_count : int, default 3\n",
    "        Minimum number of occurrences a bigram needs in order to be kept.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    dict\n",
    "        Maps each kept bigram, a tuple of two lowercased words, to its count.\n",
    "        Entries are ordered by the first occurrence of the bigram in the text.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If ``text`` is not a file id of the Gutenberg corpus.\n",
    "\n",
    "    \"\"\"\n",
    "    # Text\n",
    "    known_texts = nltk.corpus.gutenberg.fileids()\n",
    "    if text not in known_texts:\n",
    "        message = 'Unknown text \"{}\".\\nPossible values: {}'.format(text, known_texts)\n",
    "        raise ValueError(message)\n",
    "\n",
    "    # Words\n",
    "    words = [word.lower() for word in nltk.corpus.gutenberg.words(text)]\n",
    "    print('Number of words:', len(words))\n",
    "\n",
    "    # Bigrams\n",
    "    bigrams = list(nltk.bigrams(words))\n",
    "    print('Number of bigrams:', len(bigrams))\n",
    "\n",
    "    # Bigram counts\n",
    "    bigrams_counted = Counter(bigrams)\n",
    "    print('Number of unique bigrams:', len(bigrams_counted))\n",
    "\n",
    "    # Relevant bigrams\n",
    "    # A set makes each of the many membership tests below constant-time\n",
    "    stop_words = set(nltk.corpus.stopwords.words('english'))\n",
    "\n",
    "    def is_relevant_word(word):\n",
    "        return len(word) > 1 and word.isalnum() and word not in stop_words\n",
    "\n",
    "    # Relevant bigrams with counts\n",
    "    # Only the unique bigrams are checked instead of every occurrence. The counter\n",
    "    # keeps insertion order, so the result is identical in every kernel session.\n",
    "    filtered_bigrams_and_counts = {\n",
    "        bigram: count\n",
    "        for bigram, count in bigrams_counted.items()\n",
    "        if count >= min_count and all(is_relevant_word(word) for word in bigram)\n",
    "    }\n",
    "    print('Number of filtered bigrams:', len(filtered_bigrams_and_counts))\n",
    "    return filtered_bigrams_and_counts\n",
    "\n",
    "\n",
    "def bigram_counts_to_graph(bg_cnt):\n",
    "    \"\"\"Turn a mapping of bigrams to counts into a directed word graph.\n",
    "\n",
    "    Every bigram (word1, word2) becomes an edge from word1 to word2 which\n",
    "    stores the bigram's count in the edge attribute 'count'. Every node gets\n",
    "    an attribute 'size' derived from its in-degree. A one-line summary of\n",
    "    the graph is printed after an empty line.\n",
    "\n",
    "    Returns the created networkx.DiGraph.\n",
    "\n",
    "    \"\"\"\n",
    "    graph = nx.DiGraph()\n",
    "    graph.add_edges_from(\n",
    "        (first, second, {'count': occurrences})\n",
    "        for (first, second), occurrences in bg_cnt.items()\n",
    "    )\n",
    "    # The +1 gives nodes without incoming edges a non-zero size\n",
    "    for word, num_incoming in graph.in_degree:\n",
    "        graph.nodes[word]['size'] = (num_incoming + 1) * 3\n",
    "    print()\n",
    "    summary = 'Graph with {} nodes and {} edges.'.format(len(graph.nodes), len(graph.edges))\n",
    "    print(summary)\n",
    "    return graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "# File ids of the Gutenberg texts to visualize\n",
    "texts = [\n",
    "    'austen-emma.txt',\n",
    "    'carroll-alice.txt',\n",
    "    'melville-moby_dick.txt',\n",
    "    'shakespeare-caesar.txt',\n",
    "]\n",
    "\n",
    "for text in texts:\n",
    "    # Heading: the file id, underlined with dashes of the same length\n",
    "    underline = '-' * len(text)\n",
    "    print(text)\n",
    "    print(underline)\n",
    "\n",
    "    # Text -> filtered bigram counts -> directed graph\n",
    "    bigrams_and_counts = text_to_bigrams_and_counts(text, min_count=5)\n",
    "    graph = bigram_counts_to_graph(bigrams_and_counts)\n",
    "\n",
    "    # Edge width is taken from the 'count' edge attribute and normalized\n",
    "    fig = gv.d3(\n",
    "        graph,\n",
    "        edge_size_data_source='count',\n",
    "        use_edge_size_normalization=True,\n",
    "        zoom_factor=0.5,\n",
    "    )\n",
    "    fig.display(inline=True)\n",
    "    print()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}