{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Bigrams\n",
    "\n",
    "This Jupyter notebook provides an example of using the Python package [gravis](https://pypi.org/project/gravis). The .ipynb file can be found [here](https://github.com/robert-haas/gravis/tree/master/examples).\n",
    "\n",
    "It uses the **Natural Language Toolkit (NLTK)** to extract **word bigrams** from a text and filter them by simple criteria to get a list of relevant ones. Each bigram is a pair of words, therefore a list of bigrams can be interpreted as a directed graph: words as nodes, word pairs as edges, frequency of a word pair as edge width.\n",
    "\n",
    "## References\n",
    "\n",
    "- Wikipedia\n",
    "  - [n-Gram](https://en.wikipedia.org/wiki/N-gram)\n",
    "- NLTK\n",
    "  - [Accessing Text Corpora and Lexical Resources](https://www.nltk.org/book/ch02.html)\n",
    "  - [Collocations](https://www.nltk.org/howto/collocations.html)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from collections import Counter\n",
    "\n",
    "import gravis as gv\n",
    "import networkx as nx\n",
    "import nltk"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Download text corpora, if not already done before\n",
    "nltk.download('gutenberg')\n",
    "nltk.download('stopwords')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def text_to_bigrams_and_counts(text, min_count=3):\n",
    "    \"\"\"Extract the relevant word bigrams of a Gutenberg text with their counts.\n",
    "\n",
    "    A bigram is kept if it occurs at least ``min_count`` times and each of\n",
    "    its two words is longer than one character, is not an English stop word\n",
    "    and is purely alphanumeric.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    text : str\n",
    "        File id of a text in NLTK's Gutenberg corpus, e.g. 'carroll-alice.txt'.\n",
    "    min_count : int, optional\n",
    "        Minimum number of occurrences required to keep a bigram.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    dict\n",
    "        Maps each kept bigram ``(word1, word2)`` to its number of occurrences.\n",
    "        Entries are ordered by the first occurrence of the bigram in the text.\n",
    "\n",
    "    Raises\n",
    "    ------\n",
    "    ValueError\n",
    "        If ``text`` is not a file id of the Gutenberg corpus.\n",
    "\n",
    "    \"\"\"\n",
    "    # Text\n",
    "    known_texts = nltk.corpus.gutenberg.fileids()\n",
    "    if text not in known_texts:\n",
    "        message = 'Unknown text \"{}\".\\nPossible values: {}'.format(text, known_texts)\n",
    "        raise ValueError(message)\n",
    "\n",
    "    # Words, lowercased so that case variants of a word are counted together\n",
    "    words = [word.lower() for word in nltk.corpus.gutenberg.words(text)]\n",
    "    print('Number of words:', len(words))\n",
    "\n",
    "    # Bigrams\n",
    "    bigrams = list(nltk.bigrams(words))\n",
    "    print('Number of bigrams:', len(bigrams))\n",
    "\n",
    "    # Bigram counts\n",
    "    bigrams_counted = Counter(bigrams)\n",
    "    print('Number of unique bigrams:', len(bigrams_counted))\n",
    "\n",
    "    # Relevant bigrams with counts\n",
    "    # - A set allows constant-time stop word lookups instead of scanning a list\n",
    "    stop_words = frozenset(nltk.corpus.stopwords.words('english'))\n",
    "\n",
    "    def include_word(word):\n",
    "        return len(word) > 1 and word not in stop_words and word.isalnum()\n",
    "\n",
    "    # - Filtering the unique bigrams checks each one only once and keeps the\n",
    "    #   order of first occurrence, so repeated runs give the same result order\n",
    "    filtered_bigrams_and_counts = {\n",
    "        bg: count\n",
    "        for bg, count in bigrams_counted.items()\n",
    "        if count >= min_count and all(include_word(word) for word in bg)\n",
    "    }\n",
    "    print('Number of filtered bigrams:', len(filtered_bigrams_and_counts))\n",
    "    return filtered_bigrams_and_counts\n",
    "\n",
    "\n",
    "def bigram_counts_to_graph(bg_cnt):\n",
    "    \"\"\"Convert bigram counts into a directed graph.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    bg_cnt : dict\n",
    "        Maps bigrams ``(word1, word2)`` to their number of occurrences.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    networkx.DiGraph\n",
    "        Graph with one node per word and one edge per bigram, pointing from\n",
    "        the first to the second word. Each edge stores the bigram count in\n",
    "        its attribute 'count'. Each node stores a value derived from its\n",
    "        in-degree in its attribute 'size'.\n",
    "\n",
    "    \"\"\"\n",
    "    graph = nx.DiGraph()\n",
    "    for (word1, word2), count in bg_cnt.items():\n",
    "        graph.add_edge(word1, word2, count=count)\n",
    "\n",
    "    # Node size grows with the number of incoming edges, the offset of 1 keeps\n",
    "    # it above 0 for words that have no predecessor\n",
    "    for node_id, in_degree in graph.in_degree():\n",
    "        graph.nodes[node_id]['size'] = (in_degree + 1) * 3\n",
    "\n",
    "    print()\n",
    "    print('Graph with {} nodes and {} edges.'.format(\n",
    "        graph.number_of_nodes(), graph.number_of_edges()))\n",
    "    return graph"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "scrolled": false
   },
   "outputs": [],
   "source": [
    "for text in ['austen-emma.txt', 'carroll-alice.txt', 'melville-moby_dick.txt', 'shakespeare-caesar.txt']:\n",
    "    print(text)\n",
    "    print('-' * len(text))\n",
    "    bigrams_and_counts = text_to_bigrams_and_counts(text, min_count=5)\n",
    "    graph = bigram_counts_to_graph(bigrams_and_counts)\n",
    "    fig = gv.d3(\n",
    "        graph,\n",
    "        edge_size_data_source='count',\n",
    "        use_edge_size_normalization=True,\n",
    "        zoom_factor=0.5,\n",
    "    )\n",
    "    fig.display(inline=True)\n",
    "    print()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}