diff --git a/.gitignore b/.gitignore index 6c6ed9f..bb2f05d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,5 @@ .history .project .pydevproject +.DS_Store +.idea/ diff --git a/README.md b/README.md index 0a34435..8c4d21f 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,3 @@ -# evidence_extractor +# CoSID Linkage Scripts + Python scripts to run evidence extraction pipelines from text annotated with discourse tags from SciDT diff --git a/notebooks/ In-Paper Sentence Similarity .ipynb b/notebooks/ In-Paper Sentence Similarity .ipynb new file mode 100644 index 0000000..47507fd --- /dev/null +++ b/notebooks/ In-Paper Sentence Similarity .ipynb @@ -0,0 +1,219 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Methods to compute similarity scores from word embeddings\n", + "\n", + "This is taken from http://vene.ro/blog/word-movers-distance-in-python.html" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "16\n" + ] + } + ], + "source": [ + "import os\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from sklearn.cross_validation import train_test_split\n", + "\n", + "def read_sentences_from_tsv(tsv):\n", + " str_seqs = []\n", + " str_seq = []\n", + " label_seqs = []\n", + " label_seq = []\n", + " pmids = {}\n", + " \n", + " old_sentence = ''\n", + " for idx,row in tsv.iterrows():\n", + " clause = row['Clause Text']\n", + " paragraph = row['Paragraph']\n", + " sentence = row['SentenceId']\n", + " discourse = row['Discourse Type']\n", + "\n", + " if sentence != old_sentence:\n", + " if len(str_seq) != 0:\n", + " str_seqs.append(str_seq)\n", + " str_seq = []\n", + " label_seqs.append(label_seq)\n", + " label_seq = []\n", + "\n", + " str_seq.append(clause)\n", + " label_seq.append(discourse.strip())\n", + " old_sentence 
= sentence\n", + " \n", + " if len(str_seq) != 0:\n", + " str_seqs.append(str_seq)\n", + " str_seq = []\n", + " label_seqs.append(label_seq)\n", + " label_seq = []\n", + " \n", + " return str_seqs, label_seqs\n", + "\n", + "data_dir = \"/tmp/data/scidt_tsv_annotated\"\n", + "\n", + "data = {}\n", + "files = []\n", + "clauses = []\n", + "for fn in os.listdir(data_dir):\n", + " if os.path.isfile(data_dir+'/'+fn) and fn[-4:]=='.tsv' :\n", + " pmid = fn[:len(fn)-10]\n", + " files.append(data_dir+ \"/\" + fn)\n", + " tsv = pd.read_csv(data_dir+'/'+fn, sep='\\t')\n", + " data_from_pmid = read_sentences_from_tsv( tsv )\n", + " data[pmid] = data_from_pmid \n", + " \n", + "print len(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "({'9971737': ([['to act as important elements in integrin-mediated signaling ( Schlaepfer et al. , 1998 ) .']], [['result']]), '10085298': ([['to regulate actin fiber assembly and cytokinesis in both yeast ( Li , 1997 ) and mammalian ( Symons et al. , 1996 ) cells .']], [['result']]), '19734906': ([['addressing this question .']], [['goal']]), '15314656': ([['as described above .'], ['Cells were incubated in growth medium containing 1 mug/ml colcemid for 16 h .']], [['method'], ['method']]), '16848641': ([['uninfected or were infected for 5 h with F. 
novicida .'], ['Cells were lysed in 100 mul of Luciferase Cell Culture Lysis Reagent ( Promega , Madison , Wisconsin , United States ) .']], [['method'], ['method']]), '18604198': ([[\"and significance was determined using Student 's 2-tailed unpaired t-test .\"], ['Viability Assays']], [['method'], ['none']]), '18583988': ([['Melander F , Bekker-Jensen S , Falck J , Bartek J , Mailand N , Lukas J ( 2008 ) Phosphorylation of SDT repeats in the MDC1 N terminus triggers retention of NBS1 at the DNA damage-modified chromatin .']], [['none']]), '11238593': ([['as assessed by immunoprecipitation was not affected by the aly mutation ( our unpublished observation ) , although the aly mutation resides in a putative TRAF-binding domain of NIK 31 .']], [['result']]), '10790433': ([['that abnormal alterations of the phosphorylation status of PAG under certain pathological conditions ( e.g. , at sites of local inflammation ) and thus decreased Csk binding may contribute to hyperactivity of T cells ( and other leukocytes as well ) .'], ['This could be a factor contributing to development of some autoimmune disorders , for example .'], ['The phosphatase ( s ) involved in PAG dephosphorylation might thus be an interesting target for pharmacological intervention .']], [['hypothesis'], ['hypothesis'], ['hypothesis']]), '9128250': ([['that phospholipids and/or Gbetagamma subunits are involved in their intracellular localization and/or activation .']], [['hypothesis']]), '17276402': ([['that VEGF stimulates the transient activation of AMPK in cultured endothelial cells in a PLC - and CaMKK-dependent manner .'], ['AMPK , therefore , represents a novel component of VEGF signalling .']], [['implication'], ['implication']]), '10704436': ([['that nuclear export of MAPK involves a MAPKK-dependent , active transport mechanism .'], ['Because inactivated MAPK preferentially binds to MAPKK , MAPK , once deactivated in the nucleus , would be rapidly excluded from the nucleus .']], 
[['implication'], ['hypothesis']]), '16729043': ([['as significant .']], [['result']]), '9625767': ([['that p38 and other MAP kinases are involved in TNF-induced IL-6 gene expression via modulation of transactivation potential of NF-kB without affecting its DNA binding activity ( 37 ) .']], [['implication']]), '16602827': ([['it does not inhibit the STF reporter in 293Fz4 cells .']], [['result']]), '9700154': ([['providing an indication that inactive MAPK can bind to the nuclear anchor ( s ) .'], ['One of the next steps in our future investigation of MAPK nuclear signaling will clearly be the identification of the nuclear anchoring components and the analysis of the MAPK activity from the nuclear pool .']], [['implication'], ['goal']])}, [['as had been suggested for v-Src ( Fincham et al. , 1995 ; Hildebrand et al. , 1993 ) .']], [['hypothesis']])\n" + ] + } + ], + "source": [ + "vect = CountVectorizer(stop_words=\"english\").fit([d1, d2])\n", + "print(\"Features:\", \", \".join(vect.get_feature_names()))" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(3254, 50)\n" + ] + } + ], + "source": [ + "from sklearn.feature_extraction.text import CountVectorizer\n", + "\n", + "vectorizer = TfidfVectorizer(max_df=0.5, \n", + " max_features=50,\n", + " min_df=2, \n", + " stop_words='english'\n", + " )\n", + "X = vectorizer.fit_transform(sentences)\n", + "print X.shape\n" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Initialization complete\n", + "Iteration 0, inertia 5135.651\n", + "Iteration 1, inertia 2858.126\n", + "Iteration 2, inertia 2821.521\n", + "Iteration 3, inertia 2805.059\n", + "Iteration 4, inertia 2796.352\n", + "Iteration 5, inertia 2791.557\n", + "Iteration 6, inertia 2788.768\n", + 
"Iteration 7, inertia 2786.581\n", + "Iteration 8, inertia 2785.083\n", + "Iteration 9, inertia 2784.345\n", + "Iteration 10, inertia 2783.780\n", + "Iteration 11, inertia 2783.254\n", + "Iteration 12, inertia 2782.792\n", + "Iteration 13, inertia 2782.444\n", + "Iteration 14, inertia 2782.153\n", + "Iteration 15, inertia 2781.636\n", + "Iteration 16, inertia 2781.422\n", + "Iteration 17, inertia 2781.188\n", + "Iteration 18, inertia 2780.980\n", + "Iteration 19, inertia 2780.888\n", + "Iteration 20, inertia 2780.837\n", + "Iteration 21, inertia 2780.752\n", + "Iteration 22, inertia 2780.650\n", + "Iteration 23, inertia 2780.616\n", + "Converged at iteration 23\n", + "done in 0.425s\n", + "Cluster 0:\n" + ] + }, + { + "ename": "IndexError", + "evalue": "list index out of range", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mIndexError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 13\u001b[0m \u001b[1;32mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Cluster %d:\"\u001b[0m \u001b[1;33m%\u001b[0m \u001b[0mi\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 14\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mind\u001b[0m \u001b[1;32min\u001b[0m \u001b[0morder_centroids\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m:\u001b[0m\u001b[1;36m10\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 15\u001b[1;33m \u001b[1;32mprint\u001b[0m \u001b[1;34m'%d %s'\u001b[0m \u001b[1;33m%\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mind\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mterms\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mind\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 16\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + 
"\u001b[1;31mIndexError\u001b[0m: list index out of range" + ] + } + ], + "source": [ + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.13" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/.DS_Store b/notebooks/.DS_Store new file mode 100644 index 0000000..42526df Binary files /dev/null and b/notebooks/.DS_Store differ diff --git a/notebooks/.ipynb_checkpoints/ In-Paper Sentence Similarity -checkpoint.ipynb b/notebooks/.ipynb_checkpoints/ In-Paper Sentence Similarity -checkpoint.ipynb new file mode 100644 index 0000000..47507fd --- /dev/null +++ b/notebooks/.ipynb_checkpoints/ In-Paper Sentence Similarity -checkpoint.ipynb @@ -0,0 +1,219 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Methods to compute similarity scores from word embeddings\n", + "\n", + "This is taken from http://vene.ro/blog/word-movers-distance-in-python.html" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "16\n" + ] + } + ], + "source": [ + "import os\n", + "import pandas as pd\n", + "import numpy as np\n", + "\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from sklearn.cross_validation import train_test_split\n", + "\n", + "def read_sentences_from_tsv(tsv):\n", + " str_seqs = []\n", + " str_seq = []\n", + " label_seqs = []\n", + " label_seq = []\n", + " pmids = {}\n", + " \n", + " old_sentence = ''\n", + " for 
idx,row in tsv.iterrows():\n", + " clause = row['Clause Text']\n", + " paragraph = row['Paragraph']\n", + " sentence = row['SentenceId']\n", + " discourse = row['Discourse Type']\n", + "\n", + " if sentence != old_sentence:\n", + " if len(str_seq) != 0:\n", + " str_seqs.append(str_seq)\n", + " str_seq = []\n", + " label_seqs.append(label_seq)\n", + " label_seq = []\n", + "\n", + " str_seq.append(clause)\n", + " label_seq.append(discourse.strip())\n", + " old_sentence = sentence\n", + " \n", + " if len(str_seq) != 0:\n", + " str_seqs.append(str_seq)\n", + " str_seq = []\n", + " label_seqs.append(label_seq)\n", + " label_seq = []\n", + " \n", + " return str_seqs, label_seqs\n", + "\n", + "data_dir = \"/tmp/data/scidt_tsv_annotated\"\n", + "\n", + "data = {}\n", + "files = []\n", + "clauses = []\n", + "for fn in os.listdir(data_dir):\n", + " if os.path.isfile(data_dir+'/'+fn) and fn[-4:]=='.tsv' :\n", + " pmid = fn[:len(fn)-10]\n", + " files.append(data_dir+ \"/\" + fn)\n", + " tsv = pd.read_csv(data_dir+'/'+fn, sep='\\t')\n", + " data_from_pmid = read_sentences_from_tsv( tsv )\n", + " data[pmid] = data_from_pmid \n", + " \n", + "print len(data)" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "({'9971737': ([['to act as important elements in integrin-mediated signaling ( Schlaepfer et al. , 1998 ) .']], [['result']]), '10085298': ([['to regulate actin fiber assembly and cytokinesis in both yeast ( Li , 1997 ) and mammalian ( Symons et al. , 1996 ) cells .']], [['result']]), '19734906': ([['addressing this question .']], [['goal']]), '15314656': ([['as described above .'], ['Cells were incubated in growth medium containing 1 mug/ml colcemid for 16 h .']], [['method'], ['method']]), '16848641': ([['uninfected or were infected for 5 h with F. 
novicida .'], ['Cells were lysed in 100 mul of Luciferase Cell Culture Lysis Reagent ( Promega , Madison , Wisconsin , United States ) .']], [['method'], ['method']]), '18604198': ([[\"and significance was determined using Student 's 2-tailed unpaired t-test .\"], ['Viability Assays']], [['method'], ['none']]), '18583988': ([['Melander F , Bekker-Jensen S , Falck J , Bartek J , Mailand N , Lukas J ( 2008 ) Phosphorylation of SDT repeats in the MDC1 N terminus triggers retention of NBS1 at the DNA damage-modified chromatin .']], [['none']]), '11238593': ([['as assessed by immunoprecipitation was not affected by the aly mutation ( our unpublished observation ) , although the aly mutation resides in a putative TRAF-binding domain of NIK 31 .']], [['result']]), '10790433': ([['that abnormal alterations of the phosphorylation status of PAG under certain pathological conditions ( e.g. , at sites of local inflammation ) and thus decreased Csk binding may contribute to hyperactivity of T cells ( and other leukocytes as well ) .'], ['This could be a factor contributing to development of some autoimmune disorders , for example .'], ['The phosphatase ( s ) involved in PAG dephosphorylation might thus be an interesting target for pharmacological intervention .']], [['hypothesis'], ['hypothesis'], ['hypothesis']]), '9128250': ([['that phospholipids and/or Gbetagamma subunits are involved in their intracellular localization and/or activation .']], [['hypothesis']]), '17276402': ([['that VEGF stimulates the transient activation of AMPK in cultured endothelial cells in a PLC - and CaMKK-dependent manner .'], ['AMPK , therefore , represents a novel component of VEGF signalling .']], [['implication'], ['implication']]), '10704436': ([['that nuclear export of MAPK involves a MAPKK-dependent , active transport mechanism .'], ['Because inactivated MAPK preferentially binds to MAPKK , MAPK , once deactivated in the nucleus , would be rapidly excluded from the nucleus .']], 
[['implication'], ['hypothesis']]), '16729043': ([['as significant .']], [['result']]), '9625767': ([['that p38 and other MAP kinases are involved in TNF-induced IL-6 gene expression via modulation of transactivation potential of NF-kB without affecting its DNA binding activity ( 37 ) .']], [['implication']]), '16602827': ([['it does not inhibit the STF reporter in 293Fz4 cells .']], [['result']]), '9700154': ([['providing an indication that inactive MAPK can bind to the nuclear anchor ( s ) .'], ['One of the next steps in our future investigation of MAPK nuclear signaling will clearly be the identification of the nuclear anchoring components and the analysis of the MAPK activity from the nuclear pool .']], [['implication'], ['goal']])}, [['as had been suggested for v-Src ( Fincham et al. , 1995 ; Hildebrand et al. , 1993 ) .']], [['hypothesis']])\n" + ] + } + ], + "source": [ + "vect = CountVectorizer(stop_words=\"english\").fit([d1, d2])\n", + "print(\"Features:\", \", \".join(vect.get_feature_names()))" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(3254, 50)\n" + ] + } + ], + "source": [ + "from sklearn.feature_extraction.text import CountVectorizer\n", + "\n", + "vectorizer = TfidfVectorizer(max_df=0.5, \n", + " max_features=50,\n", + " min_df=2, \n", + " stop_words='english'\n", + " )\n", + "X = vectorizer.fit_transform(sentences)\n", + "print X.shape\n" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Initialization complete\n", + "Iteration 0, inertia 5135.651\n", + "Iteration 1, inertia 2858.126\n", + "Iteration 2, inertia 2821.521\n", + "Iteration 3, inertia 2805.059\n", + "Iteration 4, inertia 2796.352\n", + "Iteration 5, inertia 2791.557\n", + "Iteration 6, inertia 2788.768\n", + 
"Iteration 7, inertia 2786.581\n", + "Iteration 8, inertia 2785.083\n", + "Iteration 9, inertia 2784.345\n", + "Iteration 10, inertia 2783.780\n", + "Iteration 11, inertia 2783.254\n", + "Iteration 12, inertia 2782.792\n", + "Iteration 13, inertia 2782.444\n", + "Iteration 14, inertia 2782.153\n", + "Iteration 15, inertia 2781.636\n", + "Iteration 16, inertia 2781.422\n", + "Iteration 17, inertia 2781.188\n", + "Iteration 18, inertia 2780.980\n", + "Iteration 19, inertia 2780.888\n", + "Iteration 20, inertia 2780.837\n", + "Iteration 21, inertia 2780.752\n", + "Iteration 22, inertia 2780.650\n", + "Iteration 23, inertia 2780.616\n", + "Converged at iteration 23\n", + "done in 0.425s\n", + "Cluster 0:\n" + ] + }, + { + "ename": "IndexError", + "evalue": "list index out of range", + "output_type": "error", + "traceback": [ + "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[1;31mIndexError\u001b[0m Traceback (most recent call last)", + "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[0;32m 13\u001b[0m \u001b[1;32mprint\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"Cluster %d:\"\u001b[0m \u001b[1;33m%\u001b[0m \u001b[0mi\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 14\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mind\u001b[0m \u001b[1;32min\u001b[0m \u001b[0morder_centroids\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mi\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m:\u001b[0m\u001b[1;36m10\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 15\u001b[1;33m \u001b[1;32mprint\u001b[0m \u001b[1;34m'%d %s'\u001b[0m \u001b[1;33m%\u001b[0m \u001b[1;33m(\u001b[0m\u001b[0mind\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mterms\u001b[0m\u001b[1;33m[\u001b[0m\u001b[0mind\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 16\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n", + 
"\u001b[1;31mIndexError\u001b[0m: list index out of range" + ] + } + ], + "source": [ + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.13" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/.ipynb_checkpoints/05-02-2017 Word Movers Distance Measures Based on Experiments-checkpoint.ipynb b/notebooks/.ipynb_checkpoints/05-02-2017 Word Movers Distance Measures Based on Experiments-checkpoint.ipynb new file mode 100644 index 0000000..e704bd5 --- /dev/null +++ b/notebooks/.ipynb_checkpoints/05-02-2017 Word Movers Distance Measures Based on Experiments-checkpoint.ipynb @@ -0,0 +1,1267 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Similarity between experiments in INTACT papers\n", + "\n", + "Here we examine the analysis of text tagged for different experiments to see if there is a discernable difference between the sentences from different experiments. \n", + "\n", + "We should also perhaps attempt to annotate for entities in each sentence as well." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "import numpy as np\n", + "import gensim\n", + "\n", + "from sklearn.datasets import fetch_20newsgroups\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from sklearn.cross_validation import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0SentenceIdClause TextCodesExperimentValuesParagraphHeadingsFloatingBox?Discourse TypeOffset_BeginOffset_Endfig_spans
00s1Involvement of Pex13p in Pex14p Localization a...[]NaN-NaNFalsenone127248NaN
11s2Pex13p is the putative docking protein for per...[]NaNp3NaNFalsefact799922NaN
22s3Pex14p interacts with both the PTS1 - and PTS2...[]NaNp3NaNFalsefact9231077NaN
33s4We report the involvement of Pex13p in peroxis...[]NaNp3NaNFalsefact10781164NaN
44s5Like Pex14p ,[]NaNp3NaNFalsenone11651177NaN
55s5Pex13p not only interacts with the PTS1-recept...[]NaNp3NaNFalsefact11781325NaN
66s6In support of distinct peroxisomal binding sit...[]NaNp3NaNFalsehypothesis13261462NaN
77s6evidence for the interaction of Pex7p and Pex1...[]NaNp3NaNFalseimplication14631619NaN
88s7Accordingly , we conclude[]NaNp3NaNFalsehypothesis16201644NaN
99s7that Pex7p and Pex13p functionally interact du...[]NaNp3NaNFalsehypothesis16451743NaN
1010s8NH2-terminal regions of Pex13p are required fo...[]NaNp3NaNFalseimplication17441830NaN
1111s8while the COOH-terminal SH3 domain alone is su...[]NaNp3NaNFalseimplication18311936NaN
1212s9Reinvestigation of the topology revealed both ...[]NaNp3NaNFalseresult19372000NaN
1313s9to be oriented towards the cytosol .[]NaNp3NaNFalseimplication20012036NaN
1414s10We also found Pex13p to be required for peroxi...[]NaNp3NaNFalseresult20372115NaN
1515s10the SH3 domain of Pex13p may not provide the o...[]NaNp3NaNFalseimplication21162218NaN
1616s11Peroxisomal matrix proteins are synthesized on...[exLink]NaNp4NaNFalsefact22672419NaN
1717s12The presence of two distinct peroxisomal targe...[]NaNp4NaNFalsefact24202583NaN
1818s13PTS1 , present in the majority of peroxisomal ...[]NaNp4NaNFalsefact25842736NaN
1919s13for review see McNew and Goodman , 1996 ) .[exLink]NaNp4NaNFalsefact27372777NaN
2020s14Only one known peroxisomal matrix protein in S...[]NaNp4NaNFalsefact27782901NaN
2121s14which is typically localized close to the NH2 ...[exLink]NaNp4NaNFalsefact29023086NaN
2222s15Recognition of PTS1 and PTS2 targeting signals...[]NaNp5NaNFalsefact30873215NaN
2323s15for review see Subramani , 1996 ; Erdmann et a...[exLink]NaNp5NaNFalsefact32163270NaN
2424s16Cells deficient in either protein display part...[]NaNp5NaNFalsefact32713341NaN
2525s17pex5Delta cells correctly localize PTS2 protei...[]NaNp5NaNFalsefact33423444NaN
2626s18pex7Delta cells exhibit the reverse phenotype[]NaNp5NaNFalseresult34453491NaN
2727s18for review see Elgersma and Tabak , 1996 ) .[exLink]NaNp5NaNFalseresult34923533NaN
2828s19The intracellular localization of both targeti...[]NaNp5NaNFalseimplication35343628NaN
2929s20A predominantly cytosolic , membrane-bound , a...[]NaNp5NaNFalseresult36293749NaN
.......................................
413413s263a Pex5p/Pex7p two-hybrid interaction is not ob...[inLink]f1p71DiscussionFalseresult4060440681NaN
414414s264At first , this observation seems rather surpr...[]NaNp71DiscussionFalseimplication4068240733NaN
415415s264since both Pex5p and Pex7p independently inter...[inLink]f4p71DiscussionFalseresult4073440830NaN
416416s265One could imagine[]NaNp71DiscussionFalsehypothesis4083140848NaN
417417s265that Pex13p may serve as a bridging molecule b...[]NaNp71DiscussionFalsehypothesis4084941004NaN
418418s266However , the amount of Pex5p simultaneously a...[]NaNp71DiscussionFalsehypothesis4100541104NaN
419419s266to give a positive response .[]NaNp71DiscussionFalsehypothesis4110541133NaN
420420s267In support of this assumption , the amount of ...[inLink]f3p71DiscussionFalseimplication4113441345NaN
421421s268Perhaps Pex13p does not usually associate simu...[]NaNp71DiscussionFalseimplication4134641436NaN
422422s268or association is transient .[]NaNp71DiscussionFalseresult4143741466NaN
423423s269The domain of Pex13 that interacts with Pex7p ...[]NaNp72DiscussionFalsefact4146741561NaN
424424s269where the interaction occurs , remains unknown .[]NaNp72DiscussionFalsefact4156241608NaN
425425s270Furthermore , the intracellular localization o...[]NaNp72DiscussionFalsefact4160941699NaN
426426s271One group has reported[]NaNp72DiscussionFalseresult4170041722NaN
427427s271that the protein is exclusively localized in t...[exLink]NaNp72DiscussionFalseresult4172342008NaN
428428s272Because the SH3 domain alone does not mediate ...[]NaNp72DiscussionFalseimplication4200942093NaN
429429s272that regions NH2-terminal of the SH3 domain ma...[]NaNp72DiscussionFalsehypothesis4209442236NaN
430430s273Previously , the COOH-terminal SH3 domain has ...[exLink]NaNp72DiscussionFalseresult4223742364NaN
431431s273we found that both the NH2 terminus and the CO...[inLink]f6p72DiscussionFalseresult4236542559NaN
432432s274In this respect , it is interesting[]NaNp72DiscussionFalseimplication4256042594NaN
433433s274to note that two regions which would fulfill t...[exLink]NaNp72DiscussionFalsehypothesis4259542743NaN
434434s275The interaction of Pex13p with Pex7p has far[]NaNp73DiscussionFalsefact4274442788NaN
435435s275reaching implications for our understanding of...[]NaNp73DiscussionFalseimplication4278942879NaN
436436s276Why are there several binding sites for the im...[]NaNp73DiscussionFalsefact4288042969NaN
437437s277One hypothesis suggests[]NaNp73DiscussionFalsehypothesis4297042993NaN
438438s277that the multiple interactions reflect the exi...[exLink]NaNp73DiscussionFalsehypothesis4299443203NaN
439439s278Our confirmation that at least two peroxisomal...[]NaNp73DiscussionFalsehypothesis4320443303NaN
440440s278about which functions as the docking protein f...[]NaNp73DiscussionFalsehypothesis4330443381NaN
441441s279Experimental evidence that Pex13p may be the d...[exLink]NaNp73DiscussionFalsehypothesis4338243508NaN
442442s279the unsolved questions stress the need for rel...[]NaNp73DiscussionFalsehypothesis4350943662NaN
\n", + "

443 rows × 12 columns

\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 SentenceId Clause Text \\\n", + "0 0 s1 Involvement of Pex13p in Pex14p Localization a... \n", + "1 1 s2 Pex13p is the putative docking protein for per... \n", + "2 2 s3 Pex14p interacts with both the PTS1 - and PTS2... \n", + "3 3 s4 We report the involvement of Pex13p in peroxis... \n", + "4 4 s5 Like Pex14p , \n", + "5 5 s5 Pex13p not only interacts with the PTS1-recept... \n", + "6 6 s6 In support of distinct peroxisomal binding sit... \n", + "7 7 s6 evidence for the interaction of Pex7p and Pex1... \n", + "8 8 s7 Accordingly , we conclude \n", + "9 9 s7 that Pex7p and Pex13p functionally interact du... \n", + "10 10 s8 NH2-terminal regions of Pex13p are required fo... \n", + "11 11 s8 while the COOH-terminal SH3 domain alone is su... \n", + "12 12 s9 Reinvestigation of the topology revealed both ... \n", + "13 13 s9 to be oriented towards the cytosol . \n", + "14 14 s10 We also found Pex13p to be required for peroxi... \n", + "15 15 s10 the SH3 domain of Pex13p may not provide the o... \n", + "16 16 s11 Peroxisomal matrix proteins are synthesized on... \n", + "17 17 s12 The presence of two distinct peroxisomal targe... \n", + "18 18 s13 PTS1 , present in the majority of peroxisomal ... \n", + "19 19 s13 for review see McNew and Goodman , 1996 ) . \n", + "20 20 s14 Only one known peroxisomal matrix protein in S... \n", + "21 21 s14 which is typically localized close to the NH2 ... \n", + "22 22 s15 Recognition of PTS1 and PTS2 targeting signals... \n", + "23 23 s15 for review see Subramani , 1996 ; Erdmann et a... \n", + "24 24 s16 Cells deficient in either protein display part... \n", + "25 25 s17 pex5Delta cells correctly localize PTS2 protei... \n", + "26 26 s18 pex7Delta cells exhibit the reverse phenotype \n", + "27 27 s18 for review see Elgersma and Tabak , 1996 ) . \n", + "28 28 s19 The intracellular localization of both targeti... \n", + "29 29 s20 A predominantly cytosolic , membrane-bound , a... \n", + ".. ... 
... ... \n", + "413 413 s263 a Pex5p/Pex7p two-hybrid interaction is not ob... \n", + "414 414 s264 At first , this observation seems rather surpr... \n", + "415 415 s264 since both Pex5p and Pex7p independently inter... \n", + "416 416 s265 One could imagine \n", + "417 417 s265 that Pex13p may serve as a bridging molecule b... \n", + "418 418 s266 However , the amount of Pex5p simultaneously a... \n", + "419 419 s266 to give a positive response . \n", + "420 420 s267 In support of this assumption , the amount of ... \n", + "421 421 s268 Perhaps Pex13p does not usually associate simu... \n", + "422 422 s268 or association is transient . \n", + "423 423 s269 The domain of Pex13 that interacts with Pex7p ... \n", + "424 424 s269 where the interaction occurs , remains unknown . \n", + "425 425 s270 Furthermore , the intracellular localization o... \n", + "426 426 s271 One group has reported \n", + "427 427 s271 that the protein is exclusively localized in t... \n", + "428 428 s272 Because the SH3 domain alone does not mediate ... \n", + "429 429 s272 that regions NH2-terminal of the SH3 domain ma... \n", + "430 430 s273 Previously , the COOH-terminal SH3 domain has ... \n", + "431 431 s273 we found that both the NH2 terminus and the CO... \n", + "432 432 s274 In this respect , it is interesting \n", + "433 433 s274 to note that two regions which would fulfill t... \n", + "434 434 s275 The interaction of Pex13p with Pex7p has far \n", + "435 435 s275 reaching implications for our understanding of... \n", + "436 436 s276 Why are there several binding sites for the im... \n", + "437 437 s277 One hypothesis suggests \n", + "438 438 s277 that the multiple interactions reflect the exi... \n", + "439 439 s278 Our confirmation that at least two peroxisomal... \n", + "440 440 s278 about which functions as the docking protein f... \n", + "441 441 s279 Experimental evidence that Pex13p may be the d... \n", + "442 442 s279 the unsolved questions stress the need for rel... 
\n", + "\n", + " Codes ExperimentValues Paragraph Headings FloatingBox? \\\n", + "0 [] NaN - NaN False \n", + "1 [] NaN p3 NaN False \n", + "2 [] NaN p3 NaN False \n", + "3 [] NaN p3 NaN False \n", + "4 [] NaN p3 NaN False \n", + "5 [] NaN p3 NaN False \n", + "6 [] NaN p3 NaN False \n", + "7 [] NaN p3 NaN False \n", + "8 [] NaN p3 NaN False \n", + "9 [] NaN p3 NaN False \n", + "10 [] NaN p3 NaN False \n", + "11 [] NaN p3 NaN False \n", + "12 [] NaN p3 NaN False \n", + "13 [] NaN p3 NaN False \n", + "14 [] NaN p3 NaN False \n", + "15 [] NaN p3 NaN False \n", + "16 [exLink] NaN p4 NaN False \n", + "17 [] NaN p4 NaN False \n", + "18 [] NaN p4 NaN False \n", + "19 [exLink] NaN p4 NaN False \n", + "20 [] NaN p4 NaN False \n", + "21 [exLink] NaN p4 NaN False \n", + "22 [] NaN p5 NaN False \n", + "23 [exLink] NaN p5 NaN False \n", + "24 [] NaN p5 NaN False \n", + "25 [] NaN p5 NaN False \n", + "26 [] NaN p5 NaN False \n", + "27 [exLink] NaN p5 NaN False \n", + "28 [] NaN p5 NaN False \n", + "29 [] NaN p5 NaN False \n", + ".. ... ... ... ... ... 
\n", + "413 [inLink] f1 p71 Discussion False \n", + "414 [] NaN p71 Discussion False \n", + "415 [inLink] f4 p71 Discussion False \n", + "416 [] NaN p71 Discussion False \n", + "417 [] NaN p71 Discussion False \n", + "418 [] NaN p71 Discussion False \n", + "419 [] NaN p71 Discussion False \n", + "420 [inLink] f3 p71 Discussion False \n", + "421 [] NaN p71 Discussion False \n", + "422 [] NaN p71 Discussion False \n", + "423 [] NaN p72 Discussion False \n", + "424 [] NaN p72 Discussion False \n", + "425 [] NaN p72 Discussion False \n", + "426 [] NaN p72 Discussion False \n", + "427 [exLink] NaN p72 Discussion False \n", + "428 [] NaN p72 Discussion False \n", + "429 [] NaN p72 Discussion False \n", + "430 [exLink] NaN p72 Discussion False \n", + "431 [inLink] f6 p72 Discussion False \n", + "432 [] NaN p72 Discussion False \n", + "433 [exLink] NaN p72 Discussion False \n", + "434 [] NaN p73 Discussion False \n", + "435 [] NaN p73 Discussion False \n", + "436 [] NaN p73 Discussion False \n", + "437 [] NaN p73 Discussion False \n", + "438 [exLink] NaN p73 Discussion False \n", + "439 [] NaN p73 Discussion False \n", + "440 [] NaN p73 Discussion False \n", + "441 [exLink] NaN p73 Discussion False \n", + "442 [] NaN p73 Discussion False \n", + "\n", + " Discourse Type Offset_Begin Offset_End fig_spans \n", + "0 none 127 248 NaN \n", + "1 fact 799 922 NaN \n", + "2 fact 923 1077 NaN \n", + "3 fact 1078 1164 NaN \n", + "4 none 1165 1177 NaN \n", + "5 fact 1178 1325 NaN \n", + "6 hypothesis 1326 1462 NaN \n", + "7 implication 1463 1619 NaN \n", + "8 hypothesis 1620 1644 NaN \n", + "9 hypothesis 1645 1743 NaN \n", + "10 implication 1744 1830 NaN \n", + "11 implication 1831 1936 NaN \n", + "12 result 1937 2000 NaN \n", + "13 implication 2001 2036 NaN \n", + "14 result 2037 2115 NaN \n", + "15 implication 2116 2218 NaN \n", + "16 fact 2267 2419 NaN \n", + "17 fact 2420 2583 NaN \n", + "18 fact 2584 2736 NaN \n", + "19 fact 2737 2777 NaN \n", + "20 fact 2778 2901 NaN \n", + "21 
fact 2902 3086 NaN \n", + "22 fact 3087 3215 NaN \n", + "23 fact 3216 3270 NaN \n", + "24 fact 3271 3341 NaN \n", + "25 fact 3342 3444 NaN \n", + "26 result 3445 3491 NaN \n", + "27 result 3492 3533 NaN \n", + "28 implication 3534 3628 NaN \n", + "29 result 3629 3749 NaN \n", + ".. ... ... ... ... \n", + "413 result 40604 40681 NaN \n", + "414 implication 40682 40733 NaN \n", + "415 result 40734 40830 NaN \n", + "416 hypothesis 40831 40848 NaN \n", + "417 hypothesis 40849 41004 NaN \n", + "418 hypothesis 41005 41104 NaN \n", + "419 hypothesis 41105 41133 NaN \n", + "420 implication 41134 41345 NaN \n", + "421 implication 41346 41436 NaN \n", + "422 result 41437 41466 NaN \n", + "423 fact 41467 41561 NaN \n", + "424 fact 41562 41608 NaN \n", + "425 fact 41609 41699 NaN \n", + "426 result 41700 41722 NaN \n", + "427 result 41723 42008 NaN \n", + "428 implication 42009 42093 NaN \n", + "429 hypothesis 42094 42236 NaN \n", + "430 result 42237 42364 NaN \n", + "431 result 42365 42559 NaN \n", + "432 implication 42560 42594 NaN \n", + "433 hypothesis 42595 42743 NaN \n", + "434 fact 42744 42788 NaN \n", + "435 implication 42789 42879 NaN \n", + "436 fact 42880 42969 NaN \n", + "437 hypothesis 42970 42993 NaN \n", + "438 hypothesis 42994 43203 NaN \n", + "439 hypothesis 43204 43303 NaN \n", + "440 hypothesis 43304 43381 NaN \n", + "441 hypothesis 43382 43508 NaN \n", + "442 hypothesis 43509 43662 NaN \n", + "\n", + "[443 rows x 12 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "from nltk.corpus import stopwords\n", + "import re\n", + "\n", + "inFile = '/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/tsv_span/10087260_spans.tsv'\n", + "#/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/pathwayLogic/scidt_bioc_sentences_tsv/11777939.tsv'\n", + "tsv = pd.read_csv(inFile, sep='\\t')\n", + "\n", + "tsv" + 
] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "for i, row in tsv.iterrows():\n", + " sid = row['SentenceId']\n", + " codeStr = row['Codes']\n", + " paragraph = row['Paragraph']\n", + " text = row['Sentence Text']\n", + " heading = row['Headings']\n", + " floatingBox = row['FloatingBox?']\n", + " discourse = row['Discourse Type']\n", + " fig_spans = row['fig_spans']\n", + " \n", + " j += 1\n", + " if(reachData == reachData):\n", + " allHits += 1\n", + "\n", + " if (heading != heading):\n", + " heading = \"\"\n", + "\n", + " if (floatingBox):\n", + " continue\n", + "\n", + " if (('implication' not in discourse) and\n", + " 'result' not in discourse):\n", + " continue\n", + "\n", + " if ('methods' in heading.lower()):\n", + " continue\n", + " \n", + " r = 'X'\n", + " if(reachData != reachData):\n", + " r = '0'\n", + " \n", + " if(reachData == reachData):\n", + " hits += 1\n", + "\n", + " print(sid + ' (' + heading + ',' + discourse + ') ' + '[' + r + '] : ' + text ) \n", + " \n", + " text = re.sub(regex1,\"\",text)\n", + " sent = regex2.split(text)\n", + " sent = [w for w in sent if w not in stopwords and len(w)>0]\n", + " sentences.append(sent)\n", + "\n", + " if 'exLink' in codeStr:\n", + " continue\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.13" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/.ipynb_checkpoints/05-03-17 Building CoSID frames as Linked Data.-checkpoint.ipynb b/notebooks/.ipynb_checkpoints/05-03-17 Building CoSID frames as Linked Data.-checkpoint.ipynb new file mode 100644 index 0000000..286dcb3 --- /dev/null 
+++ b/notebooks/.ipynb_checkpoints/05-03-17 Building CoSID frames as Linked Data.-checkpoint.ipynb @@ -0,0 +1,6 @@ +{ + "cells": [], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/.ipynb_checkpoints/Analyzing Evidence + Claim Sentences -checkpoint.ipynb b/notebooks/.ipynb_checkpoints/Analyzing Evidence + Claim Sentences -checkpoint.ipynb new file mode 100644 index 0000000..5b95563 --- /dev/null +++ b/notebooks/.ipynb_checkpoints/Analyzing Evidence + Claim Sentences -checkpoint.ipynb @@ -0,0 +1,434 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from nltk.corpus import stopwords\n", + "import re\n", + "\n", + "def retrieve_sentences_for_modeling(inFile, fid):\n", + " tsv = pd.read_csv(inFile, sep='\\t')\n", + " sentences = []\n", + "\n", + " sw = stopwords.words('english')\n", + " regex1 = re.compile(r\"[\\(\\)\\{\\}\\[\\]\\;\\.\\'\\\"\\,\\/\\_\\*]\", re.IGNORECASE)\n", + " regex2 = re.compile(r\"\\s+\", re.IGNORECASE)\n", + "\n", + " allHits = 0\n", + " hits = 0\n", + " j = 0\n", + " for i, row in tsv.iterrows():\n", + " sid = row['SentenceId']\n", + " codeStr = row['Codes']\n", + " paragraph = row['Paragraph']\n", + " text = row['Sentence Text']\n", + " heading = row['Headings']\n", + " floatingBox = row['FloatingBox?']\n", + " discourse = row['Discourse Type']\n", + " reachData = row['friesEventsTypes']\n", + "\n", + " j += 1\n", + " if (reachData == reachData):\n", + " allHits += 1\n", + "\n", + " if (heading != heading):\n", + " heading = \"\"\n", + "\n", + " if (floatingBox):\n", + " continue\n", + "\n", + " if (('implication' not in discourse) and\n", + " 'result' not in discourse):\n", + " continue\n", + "\n", + " if 'exLink' in codeStr:\n", + " continue\n", + "\n", + " if ('methods' in str(heading).lower()):\n", + " continue\n", + "\n", + " r = 'X'\n", + " if (reachData != reachData):\n", + " r = 
'0'\n", + "\n", + " if (reachData == reachData):\n", + " hits += 1\n", + "\n", + " # print(sid + ' (' + heading + ',' + discourse + ') ' + '[' + r + '] : ' + text )\n", + "\n", + " text = re.sub(regex1, \"\", text)\n", + " sent = regex2.split(text)\n", + " sent = [w for w in sent if w not in sw and len(w)>0]\n", + " tup = (fid, sid, sent)\n", + " sentences.append(tup)\n", + "\n", + " return sentences" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "from ipywidgets import FloatProgress\n", + "from IPython.display import display\n", + "\n", + "inDir = '/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/scidt_fries_bioc_tsv4/'\n", + "sent_tup_list = []\n", + "\n", + "f = FloatProgress(min=0, max=100)\n", + "display(f)\n", + "\n", + "sent_list = []\n", + "for fn in os.listdir(inDir):\n", + " infile = inDir + \"/\" + fn\n", + " if (os.path.isfile(infile) and fn.endswith('.tsv')):\n", + " fid = fn.replace(\".tsv\", \"\")\n", + " f.value += 1\n", + " for tup in retrieve_sentences_for_modeling(infile, fid):\n", + " sent_list.append(tup[2]);" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from gensim import corpora\n", + "\n", + "dictionary = corpora.Dictionary(sent_list)\n", + "#dictionary.save('/tmp/deerwester.dict')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "corpus = [dictionary.doc2bow(sent) for sent in sent_list]" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "dictionary.save(inDir + '/sent.dict')\n", + "corpora.MmCorpus.serialize(inDir + '/sent.mm', corpus)" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": { + 
"collapsed": false + }, + "outputs": [], + "source": [ + "mm = corpora.MmCorpus(inDir + '/sent.mm')" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MmCorpus(88353 documents, 72629 features, 1287192 non-zero entries)\n" + ] + } + ], + "source": [ + "print(mm)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from gensim.models.ldamodel import LdaModel\n", + "lda = LdaModel(corpus=mm, id2word=dictionary, num_topics=100, update_every=1, chunksize=10000, passes=5)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[(0,\n", + " u'0.062*\"family\" + 0.045*\"defects\" + 0.043*\"members\" + 0.027*\"act\" + 0.022*\"proteins\" + 0.017*\"neurite\" + 0.016*\"subset\" + 0.014*\"chaperone\" + 0.013*\"percentage\" + 0.013*\"pronounced\"'),\n", + " (1,\n", + " u'0.058*\"even\" + 0.042*\"detect\" + 0.039*\"possibility\" + 0.027*\"partners\" + 0.022*\"abundance\" + 0.022*\"proteins\" + 0.020*\"interaction\" + 0.020*\"though\" + 0.019*\"We\" + 0.019*\"physical\"'),\n", + " (2,\n", + " u'0.111*\"residues\" + 0.076*\"Figures\" + 0.054*\"site\" + 0.052*\"conserved\" + 0.042*\"highly\" + 0.034*\"binding\" + 0.022*\"The\" + 0.018*\"critical\" + 0.017*\"region\" + 0.016*\"peptide\"'),\n", + " (3,\n", + " u'0.058*\"subunits\" + 0.047*\"subunit\" + 0.036*\"catalytic\" + 0.033*\"vesicles\" + 0.024*\"marker\" + 0.023*\"infected\" + 0.018*\"myostatin\" + 0.017*\"added\" + 0.017*\"module\" + 0.016*\"latent\"'),\n", + " (4,\n", + " u'0.110*\"Figure\" + 0.081*\"S1\" + 0.056*\"1A\" + 0.024*\"weak\" + 0.021*\"2E\" + 0.019*\"embryos\" + 0.018*\"potentially\" + 0.018*\"alanine\" + 0.018*\"dephosphorylation\" + 0.015*\"upregulated\"'),\n", + " (5,\n", + " u'0.078*\"cells\" + 
0.059*\"endogenous\" + 0.050*\"staining\" + 0.031*\"nucleus\" + 0.030*\"pattern\" + 0.026*\"cytoplasm\" + 0.026*\"localized\" + 0.023*\"Fig\" + 0.021*\"ubiquitination\" + 0.016*\"HeLa\"'),\n", + " (6,\n", + " u'0.057*\"indicates\" + 0.048*\"Additional\" + 0.047*\"treated\" + 0.040*\"Consistent\" + 0.037*\"file\" + 0.036*\"cells\" + 0.030*\"measured\" + 0.025*\"Figure\" + 0.022*\"1\" + 0.019*\"ERK\"'),\n", + " (7,\n", + " u'0.102*\"used\" + 0.052*\"major\" + 0.031*\"associate\" + 0.024*\"proteins\" + 0.020*\"protein\" + 0.019*\"One\" + 0.019*\"IgG\" + 0.017*\"stably\" + 0.016*\"recognized\" + 0.015*\"cancers\"'),\n", + " (8,\n", + " u'0.147*\"role\" + 0.042*\"differences\" + 0.041*\"important\" + 0.038*\"substrate\" + 0.026*\"extent\" + 0.024*\"plays\" + 0.022*\"central\" + 0.022*\"--\" + 0.020*\"The\" + 0.020*\"play\"'),\n", + " (9,\n", + " u'0.076*\"domains\" + 0.051*\"motif\" + 0.040*\"domain\" + 0.040*\"binding\" + 0.023*\"specificity\" + 0.023*\"SH3\" + 0.020*\"proteins\" + 0.019*\"putative\" + 0.017*\"contain\" + 0.015*\"PKA\"'),\n", + " (10,\n", + " u'0.092*\"mutant\" + 0.047*\"single\" + 0.041*\"actin\" + 0.034*\"mutants\" + 0.022*\"Further\" + 0.022*\"double\" + 0.019*\"defect\" + 0.018*\"isoform\" + 0.016*\"towards\" + 0.015*\"cytoskeleton\"'),\n", + " (11,\n", + " u'0.052*\"005\" + 0.026*\"Tau\" + 0.022*\"abrogated\" + 0.022*\"Dab2\" + 0.022*\"Rab7\" + 0.021*\"LRP6\" + 0.019*\"Wnt\" + 0.018*\"signaling\" + 0.017*\"c-Met\" + 0.017*\"Lyn\"'),\n", + " (12,\n", + " u'0.089*\"panel\" + 0.058*\"Fig\" + 0.039*\"right\" + 0.034*\"left\" + 0.029*\"panels\" + 0.026*\"experiment\" + 0.023*\"top\" + 0.023*\"Figs\" + 0.020*\"retained\" + 0.020*\"bottom\"'),\n", + " (13,\n", + " u'0.077*\"WT\" + 0.045*\"cells\" + 0.044*\"obtained\" + 0.044*\"lower\" + 0.031*\"data\" + 0.031*\"results\" + 0.027*\"Similar\" + 0.024*\"much\" + 0.023*\"wt\" + 0.020*\"MEFs\"'),\n", + " (14,\n", + " u'0.150*\"interactions\" + 0.040*\"proteins\" + 0.029*\"structural\" + 0.024*\"motifs\" + 
0.023*\"peptides\" + 0.022*\"interface\" + 0.022*\"interaction\" + 0.020*\"rather\" + 0.019*\"common\" + 0.018*\"The\"'),\n", + " (15,\n", + " u'0.064*\"size\" + 0.050*\"case\" + 0.029*\"agreement\" + 0.028*\"sensitive\" + 0.028*\"regulatory\" + 0.027*\"In\" + 0.025*\"muscle\" + 0.025*\"reduces\" + 0.022*\"clathrin\" + 0.022*\"3E\"'),\n", + " (16,\n", + " u'0.144*\"levels\" + 0.077*\"expression\" + 0.042*\"mRNA\" + 0.037*\"Figure\" + 0.036*\"low\" + 0.031*\"high\" + 0.028*\"normal\" + 0.028*\"protein\" + 0.019*\"level\" + 0.017*\"3C\"'),\n", + " (17,\n", + " u'0.090*\"type\" + 0.071*\"inhibitor\" + 0.052*\"exhibited\" + 0.044*\"wild\" + 0.028*\"mutant\" + 0.025*\"analyzed\" + 0.023*\"III\" + 0.022*\"efficiency\" + 0.020*\"lead\" + 0.019*\"defective\"'),\n", + " (18,\n", + " u'0.052*\"using\" + 0.051*\"antibody\" + 0.038*\"confirmed\" + 0.035*\"detected\" + 0.035*\"Figure\" + 0.027*\"analysis\" + 0.025*\"Fig\" + 0.021*\"lysates\" + 0.020*\"shown\" + 0.020*\"extracts\"'),\n", + " (19,\n", + " u'0.060*\"manner\" + 0.043*\"patients\" + 0.032*\"amounts\" + 0.030*\"processing\" + 0.026*\"increasing\" + 0.019*\"trafficking\" + 0.018*\"disruption\" + 0.017*\"dose-dependent\" + 0.017*\"unaffected\" + 0.017*\"exposure\"'),\n", + " (20,\n", + " u'0.070*\"positive\" + 0.069*\"p\" + 0.048*\"ER\" + 0.035*\"contained\" + 0.034*\"altered\" + 0.033*\"ratio\" + 0.023*\"7C\" + 0.020*\"effective\" + 0.019*\"pairs\" + 0.015*\"inhibiting\"'),\n", + " (21,\n", + " u'0.164*\"domain\" + 0.080*\"additional\" + 0.064*\"region\" + 0.063*\"data\" + 0.056*\"file\" + 0.054*\"Click\" + 0.039*\"C-terminal\" + 0.034*\"N-terminal\" + 0.017*\"part\" + 0.017*\"binding\"'),\n", + " (22,\n", + " u'0.053*\"following\" + 0.035*\"infection\" + 0.035*\"G\" + 0.034*\"cells\" + 0.028*\"virus\" + 0.028*\"remained\" + 0.018*\"H\" + 0.018*\"Fig\" + 0.016*\"particles\" + 0.015*\"incubated\"'),\n", + " (23,\n", + " u'0.107*\"increased\" + 0.082*\"significantly\" + 0.063*\"cells\" + 0.046*\"compared\" + 
0.043*\"Figure\" + 0.035*\"decreased\" + 0.032*\"higher\" + 0.031*\"reduced\" + 0.017*\"concentration\" + 0.017*\"amount\"'),\n", + " (24,\n", + " u'0.105*\"Our\" + 0.046*\"cell\" + 0.046*\"data\" + 0.029*\"Together\" + 0.028*\"development\" + 0.026*\"suggest\" + 0.026*\"increases\" + 0.025*\"cycle\" + 0.022*\"mechanisms\" + 0.021*\"results\"'),\n", + " (25,\n", + " u'0.077*\"localization\" + 0.037*\"distribution\" + 0.025*\"kinases\" + 0.024*\"enrichment\" + 0.021*\"subcellular\" + 0.019*\"possibly\" + 0.017*\"summary\" + 0.016*\"In\" + 0.014*\"investigated\" + 0.014*\"consequence\"'),\n", + " (26,\n", + " u'0.102*\"cells\" + 0.087*\"T\" + 0.031*\"Fig\" + 0.026*\"concentrations\" + 0.021*\"relatively\" + 0.019*\"hours\" + 0.018*\"CD4+\" + 0.017*\"cell\" + 0.013*\"days\" + 0.013*\"transient\"'),\n", + " (27,\n", + " u'0.089*\"replication\" + 0.035*\"density\" + 0.034*\"dimer\" + 0.033*\"complete\" + 0.028*\"Figure\" + 0.026*\"enzyme\" + 0.022*\"partial\" + 0.021*\"Hsp27\" + 0.018*\"equivalent\" + 0.017*\"4E\"'),\n", + " (28,\n", + " u'0.063*\">\" + 0.045*\"N\" + 0.032*\"beta\" + 0.027*\"per\" + 0.025*\"probably\" + 0.025*\"Notably\" + 0.023*\"causes\" + 0.023*\"Of\" + 0.022*\"alpha\" + 0.021*\"animals\"'),\n", + " (29,\n", + " u'0.072*\"activation\" + 0.039*\"promoter\" + 0.037*\"activity\" + 0.032*\"stimulation\" + 0.030*\"cells\" + 0.027*\"Fig\" + 0.026*\"expression\" + 0.025*\"induced\" + 0.024*\"inhibited\" + 0.022*\"reporter\"'),\n", + " (30,\n", + " u'0.039*\"c\" + 0.032*\"Fig\" + 0.030*\"cells\" + 0.030*\"E-cadherin\" + 0.025*\"d\" + 0.020*\"lung\" + 0.019*\"vinculin\" + 0.019*\"mM\" + 0.017*\"cytosol\" + 0.017*\"mature\"'),\n", + " (31,\n", + " u'0.055*\"cells\" + 0.047*\"knockdown\" + 0.042*\"resulted\" + 0.038*\"reduction\" + 0.037*\"siRNA\" + 0.035*\"decrease\" + 0.034*\"Fig\" + 0.027*\"caused\" + 0.023*\"impaired\" + 0.020*\"expression\"'),\n", + " (32,\n", + " u'0.042*\"residue\" + 0.032*\"chain\" + 0.031*\"position\" + 0.027*\"identical\" + 
0.025*\"appear\" + 0.021*\"side\" + 0.018*\"mutated\" + 0.018*\"In\" + 0.018*\"PD\" + 0.017*\"selective\"'),\n", + " (33,\n", + " u'0.111*\"<\" + 0.090*\"P\" + 0.049*\"neurons\" + 0.037*\"comparison\" + 0.021*\"001\" + 0.021*\"bud\" + 0.018*\"Analysis\" + 0.018*\"0001\" + 0.017*\"48\" + 0.017*\"significant\"'),\n", + " (34,\n", + " u'0.129*\"Figure\" + 0.052*\"4C\" + 0.038*\"5C\" + 0.035*\"4D\" + 0.027*\"MAVS\" + 0.021*\"S2\" + 0.021*\"NSP1\" + 0.016*\"rapid\" + 0.015*\"x\" + 0.014*\"c-Myc\"'),\n", + " (35,\n", + " u'0.130*\"results\" + 0.129*\"These\" + 0.065*\"suggest\" + 0.050*\"indicate\" + 0.044*\"vivo\" + 0.040*\"data\" + 0.031*\"findings\" + 0.020*\"observations\" + 0.018*\"interaction\" + 0.018*\"may\"'),\n", + " (36,\n", + " u'0.070*\"cells\" + 0.066*\"surface\" + 0.051*\"2A\" + 0.045*\"expression\" + 0.043*\"cell\" + 0.040*\"induction\" + 0.026*\"induces\" + 0.026*\"leads\" + 0.022*\"Expression\" + 0.021*\"epithelial\"'),\n", + " (37,\n", + " u'0.100*\"Table\" + 0.099*\"1\" + 0.058*\"Fig\" + 0.032*\"2\" + 0.032*\"S2\" + 0.021*\"supplementary\" + 0.019*\"sensitivity\" + 0.019*\"Information\" + 0.017*\"proteins\" + 0.016*\"online\"'),\n", + " (38,\n", + " u'0.046*\"early\" + 0.042*\"strain\" + 0.030*\"APP\" + 0.029*\"late\" + 0.027*\"endosomes\" + 0.024*\"stage\" + 0.021*\"Data\" + 0.019*\"lysosomal\" + 0.016*\"lysosomes\" + 0.015*\"CMA\"'),\n", + " (39,\n", + " u'0.140*\"effect\" + 0.044*\"without\" + 0.037*\"controls\" + 0.030*\"inhibitory\" + 0.026*\"showing\" + 0.025*\"little\" + 0.022*\"supported\" + 0.022*\"HIV-1\" + 0.020*\"GTP\" + 0.017*\"patterns\"'),\n", + " (40,\n", + " u'0.040*\"signalling\" + 0.034*\"support\" + 0.032*\"synthesis\" + 0.031*\"combination\" + 0.027*\"experimental\" + 0.024*\"physiological\" + 0.022*\"conclusion\" + 0.020*\"test\" + 0.020*\"functionally\" + 0.017*\"To\"'),\n", + " (41,\n", + " u'0.066*\"whether\" + 0.043*\"determined\" + 0.042*\"examined\" + 0.037*\"survival\" + 0.033*\"lack\" + 0.031*\"produced\" + 
0.024*\"secretion\" + 0.022*\"around\" + 0.021*\"We\" + 0.021*\"resistance\"'),\n", + " (42,\n", + " u'0.070*\"It\" + 0.047*\"possible\" + 0.038*\"For\" + 0.029*\"blocked\" + 0.025*\"example\" + 0.023*\"yet\" + 0.021*\"foci\" + 0.020*\"approach\" + 0.019*\"AKT\" + 0.018*\"action\"'),\n", + " (43,\n", + " u'0.039*\"12\" + 0.032*\"11\" + 0.030*\"represent\" + 0.020*\"zinc\" + 0.019*\"standard\" + 0.018*\"bars\" + 0.018*\"value\" + 0.017*\"colocalized\" + 0.017*\"genomic\" + 0.016*\"pH\"'),\n", + " (44,\n", + " u'0.214*\"complex\" + 0.091*\"formation\" + 0.019*\"form\" + 0.018*\"component\" + 0.013*\"context\" + 0.013*\"complexes\" + 0.012*\"provided\" + 0.011*\"We\" + 0.010*\"involvement\" + 0.009*\"DNMT1\"'),\n", + " (45,\n", + " u'0.063*\"inhibition\" + 0.049*\"activity\" + 0.047*\"enhanced\" + 0.044*\"due\" + 0.039*\"migration\" + 0.037*\"mediated\" + 0.037*\"dependent\" + 0.036*\"via\" + 0.029*\"Importantly\" + 0.027*\"transcriptional\"'),\n", + " (46,\n", + " u'0.126*\"mice\" + 0.077*\"conditions\" + 0.036*\"proliferation\" + 0.031*\"cells\" + 0.025*\"vector\" + 0.021*\"find\" + 0.015*\"cultured\" + 0.014*\"expression\" + 0.013*\"control\" + 0.012*\"transgenic\"'),\n", + " (47,\n", + " u'0.043*\"membranes\" + 0.035*\"dissociation\" + 0.031*\"glucose\" + 0.023*\"22\" + 0.021*\"constitutive\" + 0.021*\"mm\" + 0.021*\"labeling\" + 0.020*\"distal\" + 0.017*\"80\" + 0.016*\"chromosomes\"'),\n", + " (48,\n", + " u'0.076*\"kinase\" + 0.075*\"signal\" + 0.046*\"pathways\" + 0.033*\"The\" + 0.030*\"length\" + 0.021*\"genetic\" + 0.019*\"extracellular\" + 0.019*\"capable\" + 0.018*\"full\" + 0.017*\"activity\"'),\n", + " (49,\n", + " u'0.081*\"phenotype\" + 0.044*\"samples\" + 0.043*\"2D\" + 0.024*\"rescue\" + 0.022*\"nuclei\" + 0.022*\"noted\" + 0.013*\"significant\" + 0.013*\"statistically\" + 0.013*\"The\" + 0.013*\"maintenance\"'),\n", + " (50,\n", + " u'0.080*\"molecular\" + 0.041*\"protein\" + 0.036*\"mass\" + 0.031*\"SIRT1\" + 0.024*\"impact\" + 0.023*\"weight\" + 
0.018*\"predicted\" + 0.017*\"extensive\" + 0.015*\"The\" + 0.015*\"1E\"'),\n", + " (51,\n", + " u'0.034*\"translation\" + 0.030*\"clusters\" + 0.026*\"MuSK\" + 0.025*\"cluster\" + 0.024*\"DDX3\" + 0.022*\"CENP-E\" + 0.020*\"Rab32\" + 0.019*\"hBUBR1\" + 0.017*\"PABP\" + 0.016*\"Rab8\"'),\n", + " (52,\n", + " u'0.087*\"Fig\" + 0.059*\"C\" + 0.053*\"D\" + 0.037*\"E\" + 0.028*\"network\" + 0.027*\"F\" + 0.026*\"stress\" + 0.019*\"B\" + 0.015*\"1\" + 0.014*\"F-actin\"'),\n", + " (53,\n", + " u'0.043*\"least\" + 0.036*\"molecules\" + 0.033*\"two\" + 0.032*\"group\" + 0.030*\"interacted\" + 0.029*\"one\" + 0.027*\"proteins\" + 0.022*\"tail\" + 0.021*\"peak\" + 0.017*\"groups\"'),\n", + " (54,\n", + " u'0.055*\"vitro\" + 0.054*\"phosphorylated\" + 0.048*\"assays\" + 0.043*\"assay\" + 0.031*\"purified\" + 0.030*\"binding\" + 0.025*\"affinity\" + 0.022*\"bound\" + 0.018*\"Fig\" + 0.016*\"interaction\"'),\n", + " (55,\n", + " u'0.086*\"transfected\" + 0.074*\"cells\" + 0.037*\"constructs\" + 0.035*\"stable\" + 0.018*\"transfection\" + 0.017*\"expressing\" + 0.016*\"HEK293\" + 0.016*\"capacity\" + 0.015*\"VEGF\" + 0.014*\"plasmids\"'),\n", + " (56,\n", + " u'0.078*\"degradation\" + 0.062*\"RNA\" + 0.056*\"3B\" + 0.054*\"therefore\" + 0.032*\"appears\" + 0.029*\"along\" + 0.028*\"targets\" + 0.028*\"On\" + 0.023*\"hand\" + 0.022*\"promotes\"'),\n", + " (57,\n", + " u'0.087*\"study\" + 0.043*\"In\" + 0.041*\"potential\" + 0.041*\"novel\" + 0.026*\"present\" + 0.024*\"new\" + 0.023*\"presented\" + 0.022*\"Here\" + 0.022*\"protein\" + 0.021*\"Myo5p\"'),\n", + " (58,\n", + " u'0.173*\"phosphorylation\" + 0.050*\"sites\" + 0.029*\"kinase\" + 0.028*\"tyrosine\" + 0.028*\"14-3-3\" + 0.024*\"binding\" + 0.024*\"receptors\" + 0.021*\"isoforms\" + 0.019*\"inhibitors\" + 0.013*\"many\"'),\n", + " (59,\n", + " u'0.044*\"ATP\" + 0.035*\"suppression\" + 0.031*\"neither\" + 0.028*\"Supplemental\" + 0.027*\"transmembrane\" + 0.021*\"particularly\" + 0.021*\"gel\" + 0.021*\"allow\" + 
0.019*\"frequency\" + 0.018*\"puncta\"'),\n", + " (60,\n", + " u'0.265*\":\" + 0.026*\"available\" + 0.024*\"disease\" + 0.015*\"genome\" + 0.015*\"detection\" + 0.014*\"Arabidopsis\" + 0.011*\"information\" + 0.010*\"closely\" + 0.009*\"human\" + 0.009*\"PPI\"'),\n", + " (61,\n", + " u'0.049*\"sequence\" + 0.035*\"targeting\" + 0.031*\"sequences\" + 0.029*\"b\" + 0.025*\"mitochondrial\" + 0.024*\"related\" + 0.023*\"species\" + 0.020*\"protein\" + 0.019*\"S\" + 0.018*\"degree\"'),\n", + " (62,\n", + " u'0.053*\"total\" + 0.046*\"kDa\" + 0.033*\"Both\" + 0.029*\"Gin4\" + 0.026*\"larger\" + 0.022*\"intensity\" + 0.022*\"rRNA\" + 0.020*\"values\" + 0.018*\"NOD2\" + 0.016*\"Drosophila\"'),\n", + " (63,\n", + " u'0.084*\"DNA\" + 0.072*\"transcription\" + 0.046*\"factors\" + 0.030*\"chromatin\" + 0.025*\"acetylation\" + 0.025*\"binding\" + 0.021*\"promoter\" + 0.020*\"set\" + 0.018*\"activity\" + 0.017*\"factor\"'),\n", + " (64,\n", + " u'0.064*\"I\" + 0.043*\"2C\" + 0.038*\"II\" + 0.035*\"p53\" + 0.033*\"FAK\" + 0.026*\"Pol\" + 0.025*\"RNAi\" + 0.024*\"Akt\" + 0.021*\"REG\" + 0.017*\"E2F1\"'),\n", + " (65,\n", + " u'0.059*\"alone\" + 0.043*\"fusion\" + 0.034*\"construct\" + 0.030*\"Fig\" + 0.030*\"recombinant\" + 0.027*\"protein\" + 0.021*\"soluble\" + 0.020*\"GST\" + 0.019*\"Src\" + 0.018*\"G2019S\"'),\n", + " (66,\n", + " u'0.060*\"changes\" + 0.050*\"change\" + 0.029*\"death\" + 0.024*\"enriched\" + 0.020*\"characterized\" + 0.018*\"conformational\" + 0.018*\"binding\" + 0.016*\"The\" + 0.016*\"neuronal\" + 0.016*\"adjacent\"'),\n", + " (67,\n", + " u'0.061*\"EGFR\" + 0.035*\"endocytosis\" + 0.030*\"There\" + 0.026*\"molecule\" + 0.023*\"occur\" + 0.020*\"eg\" + 0.020*\"chains\" + 0.019*\"modification\" + 0.019*\"EGF\" + 0.018*\"contacts\"'),\n", + " (68,\n", + " u'0.047*\"fraction\" + 0.044*\"Fig\" + 0.042*\"antibodies\" + 0.038*\"cells\" + 0.038*\"5B\" + 0.037*\"fractions\" + 0.028*\"performed\" + 0.022*\"Western\" + 0.021*\"blotting\" + 
0.017*\"immunoprecipitated\"'),\n", + " (69,\n", + " u'0.038*\"By\" + 0.033*\"OPTN\" + 0.033*\"smaller\" + 0.029*\"mitosis\" + 0.029*\"overexpressing\" + 0.022*\"contrast\" + 0.020*\"instead\" + 0.018*\"portion\" + 0.016*\"ATRX\" + 0.016*\"resistant\"'),\n", + " (70,\n", + " u'0.158*\"%\" + 0.132*\"-\" + 0.058*\"=\" + 0.045*\"+\" + 0.033*\"10\" + 0.027*\"min\" + 0.023*\"respectively\" + 0.019*\"viral\" + 0.016*\"n\" + 0.015*\"50\"'),\n", + " (71,\n", + " u'0.089*\"active\" + 0.066*\"3A\" + 0.037*\"close\" + 0.029*\"proximity\" + 0.026*\"combined\" + 0.024*\"established\" + 0.020*\"linked\" + 0.020*\"readily\" + 0.015*\"form\" + 0.014*\"viruses\"'),\n", + " (72,\n", + " u'0.177*\"LRRK2\" + 0.035*\"tumors\" + 0.026*\"aggregates\" + 0.021*\"clones\" + 0.020*\"microscopy\" + 0.018*\"CD\" + 0.018*\"Based\" + 0.014*\"box\" + 0.014*\"ArfGAP1\" + 0.012*\"We\"'),\n", + " (73,\n", + " u'0.047*\"upstream\" + 0.039*\"tissue\" + 0.034*\"demonstrating\" + 0.031*\"correlated\" + 0.027*\"apical\" + 0.022*\"associates\" + 0.019*\"TLP\" + 0.018*\"TFIIA\" + 0.015*\"peripheral\" + 0.015*\"networks\"'),\n", + " (74,\n", + " u'0.055*\"yeast\" + 0.039*\"previously\" + 0.035*\"interaction\" + 0.035*\"system\" + 0.032*\"clearly\" + 0.030*\"reported\" + 0.027*\"demonstrate\" + 0.019*\"described\" + 0.019*\"binding\" + 0.018*\"proteins\"'),\n", + " (75,\n", + " u'0.114*\"Fig\" + 0.065*\"3\" + 0.057*\"4\" + 0.040*\"5\" + 0.040*\"B\" + 0.039*\"A\" + 0.037*\"nuclear\" + 0.037*\"6\" + 0.036*\"2\" + 0.030*\"Figure\"'),\n", + " (76,\n", + " u'0.107*\"Figure\" + 0.055*\"5A\" + 0.039*\"efficiently\" + 0.026*\"GC\" + 0.023*\"plants\" + 0.016*\"turn\" + 0.015*\"preferentially\" + 0.014*\"COP\" + 0.014*\"S7\" + 0.013*\"TMZ\"'),\n", + " (77,\n", + " u'0.024*\"transport\" + 0.023*\"like\" + 0.022*\"screen\" + 0.020*\"recovered\" + 0.020*\"minor\" + 0.018*\"INCENP\" + 0.017*\"Roquin\" + 0.016*\"First\" + 0.015*\"arrows\" + 0.013*\"maturation\"'),\n", + " (78,\n", + " u'0.069*\"growth\" + 0.066*\"cells\" + 
0.058*\"time\" + 0.053*\"h\" + 0.043*\"cell\" + 0.037*\"treatment\" + 0.035*\"apoptosis\" + 0.025*\"24\" + 0.022*\"rate\" + 0.019*\"At\"'),\n", + " (79,\n", + " u'0.083*\"amino\" + 0.051*\"target\" + 0.051*\"acid\" + 0.045*\"acids\" + 0.030*\"individual\" + 0.027*\"promote\" + 0.027*\"1C\" + 0.022*\"immune\" + 0.021*\"ubiquitin\" + 0.013*\"over-expression\"'),\n", + " (80,\n", + " u'0.096*\"three\" + 0.053*\"independent\" + 0.039*\"four\" + 0.033*\"proteins\" + 0.028*\"All\" + 0.026*\"almost\" + 0.023*\"block\" + 0.019*\"experiments\" + 0.018*\"general\" + 0.017*\"inactive\"'),\n", + " (81,\n", + " u'0.062*\"4B\" + 0.048*\"Figure\" + 0.042*\"interacting\" + 0.036*\"appeared\" + 0.031*\"START\" + 0.024*\"Hsp90\" + 0.020*\"proteins\" + 0.018*\"The\" + 0.017*\"light\" + 0.016*\"5E\"'),\n", + " (82,\n", + " u'0.087*\"mutation\" + 0.048*\"completely\" + 0.043*\"Figure\" + 0.034*\"S5\" + 0.034*\"abolished\" + 0.031*\"inhibits\" + 0.031*\"binding\" + 0.025*\"mutations\" + 0.021*\"interaction\" + 0.021*\"integrin\"'),\n", + " (83,\n", + " u'0.063*\"relative\" + 0.057*\"basal\" + 0.037*\"cleavage\" + 0.031*\"nM\" + 0.026*\"mean\" + 0.026*\"phase\" + 0.023*\"cause\" + 0.018*\"sample\" + 0.017*\"M\" + 0.016*\"G1\"'),\n", + " (84,\n", + " u'0.058*\"4A\" + 0.055*\"production\" + 0.054*\"Figure\" + 0.048*\"S3\" + 0.047*\"NS5A\" + 0.027*\"colocalization\" + 0.023*\"observe\" + 0.023*\"cells\" + 0.021*\"despite\" + 0.021*\"PI4KIIIalpha\"'),\n", + " (85,\n", + " u'0.093*\"Figure\" + 0.040*\"6A\" + 0.034*\"HCV\" + 0.033*\"6B\" + 0.026*\"tau\" + 0.026*\"6C\" + 0.026*\"generated\" + 0.026*\"co-localization\" + 0.020*\"confirm\" + 0.019*\"negative\"'),\n", + " (86,\n", + " u'0.041*\"suggests\" + 0.039*\"may\" + 0.033*\"This\" + 0.033*\"cellular\" + 0.031*\"regulation\" + 0.029*\"mechanism\" + 0.027*\"proteins\" + 0.027*\"functions\" + 0.026*\"evidence\" + 0.023*\"involved\"'),\n", + " (87,\n", + " u'0.140*\"gene\" + 0.057*\"mouse\" + 0.054*\"expression\" + 0.036*\"silencing\" + 
0.030*\"substrates\" + 0.022*\"LAX\" + 0.022*\"upper\" + 0.021*\"CTLA-4\" + 0.018*\"fibroblasts\" + 0.016*\"open\"'),\n", + " (88,\n", + " u'0.078*\"tumor\" + 0.045*\"factor\" + 0.034*\"mitotic\" + 0.029*\"red\" + 0.028*\"plasmid\" + 0.025*\"growth\" + 0.024*\"cells\" + 0.021*\"effector\" + 0.021*\"green\" + 0.021*\"initiation\"'),\n", + " (89,\n", + " u'0.064*\"deletion\" + 0.043*\"lacking\" + 0.033*\"fragment\" + 0.032*\"Finally\" + 0.030*\"ie\" + 0.021*\"allele\" + 0.021*\"step\" + 0.021*\"subsequent\" + 0.018*\"mutant\" + 0.017*\"lost\"'),\n", + " (90,\n", + " u'0.106*\"structure\" + 0.035*\"among\" + 0.029*\"conclude\" + 0.029*\"located\" + 0.021*\"The\" + 0.020*\"native\" + 0.018*\"chromosome\" + 0.017*\"A\" + 0.016*\"dystrophin\" + 0.012*\"We\"'),\n", + " (91,\n", + " u'0.070*\"pathway\" + 0.067*\"signaling\" + 0.055*\"together\" + 0.031*\"Taken\" + 0.031*\"beta-catenin\" + 0.029*\"activation\" + 0.024*\"downstream\" + 0.019*\"data\" + 0.018*\"regulates\" + 0.016*\"state\"'),\n", + " (92,\n", + " u'0.078*\"et\" + 0.078*\"al\" + 0.037*\"PTEN\" + 0.023*\"targeted\" + 0.021*\"motility\" + 0.014*\"Pan3\" + 0.014*\"&\" + 0.013*\"ERK12\" + 0.011*\"Pan2\" + 0.011*\"also\"'),\n", + " (93,\n", + " u'0.131*\"membrane\" + 0.098*\"Figure\" + 0.053*\"2B\" + 0.040*\"plasma\" + 0.032*\"S4\" + 0.029*\"3D\" + 0.019*\"majority\" + 0.019*\"mainly\" + 0.017*\"culture\" + 0.016*\"Golgi\"'),\n", + " (94,\n", + " u'0.035*\"fully\" + 0.030*\"calreticulin\" + 0.025*\"robust\" + 0.023*\"repair\" + 0.018*\"presumably\" + 0.015*\"c-Fos\" + 0.014*\"TLR9\" + 0.013*\"cisplatin\" + 0.013*\"deficient\" + 0.012*\"hMSH5\"'),\n", + " (95,\n", + " u'0.042*\"human\" + 0.034*\"slightly\" + 0.030*\"background\" + 0.028*\"PDI\" + 0.028*\"confirming\" + 0.028*\"5D\" + 0.027*\"depleted\" + 0.027*\"autophosphorylation\" + 0.025*\"determine\" + 0.021*\"rat\"'),\n", + " (96,\n", + " u'0.049*\"cells\" + 0.043*\"stability\" + 0.028*\"followed\" + 0.026*\"spindle\" + 0.026*\"invasion\" + 0.019*\"variant\" 
+ 0.018*\"DACT1\" + 0.017*\"microtubules\" + 0.017*\"variants\" + 0.015*\"often\"'),\n", + " (97,\n", + " u'0.108*\"cell\" + 0.071*\"cancer\" + 0.056*\"lines\" + 0.042*\"cells\" + 0.040*\"Discussion\" + 0.034*\"breast\" + 0.029*\"line\" + 0.025*\"difference\" + 0.022*\"expression\" + 0.020*\"No\"'),\n", + " (98,\n", + " u'0.366*\"Supplementary\" + 0.038*\"Material\" + 0.038*\"Fig\" + 0.033*\"primary\" + 0.023*\"lipid\" + 0.022*\"Rb\" + 0.017*\"HAUSP\" + 0.016*\"secondary\" + 0.013*\"1b\" + 0.013*\"deficiency\"'),\n", + " (99,\n", + " u'0.154*\"Figure\" + 0.042*\"1B\" + 0.040*\"depletion\" + 0.032*\"GFP\" + 0.027*\"fluorescence\" + 0.023*\"In\" + 0.023*\"7A\" + 0.022*\"1D\" + 0.020*\"7B\" + 0.015*\"mRNAs\"')]" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lda.print_topics(100)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.13" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/.ipynb_checkpoints/CoSID Gantts -checkpoint.ipynb b/notebooks/.ipynb_checkpoints/CoSID Gantts -checkpoint.ipynb new file mode 100644 index 0000000..a441b98 --- /dev/null +++ b/notebooks/.ipynb_checkpoints/CoSID Gantts -checkpoint.ipynb @@ -0,0 +1,677 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 165, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " Loading BokehJS ...\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "\n", + "(function(global) {\n", + " function now() {\n", + " return new Date();\n", + " }\n", + "\n", + " if (typeof (window._bokeh_onload_callbacks) === \"undefined\") {\n", + " window._bokeh_onload_callbacks = [];\n", + " }\n", + "\n", + " function run_callbacks() {\n", + " window._bokeh_onload_callbacks.forEach(function(callback) { callback() });\n", + " delete window._bokeh_onload_callbacks\n", + " console.info(\"Bokeh: all callbacks have finished\");\n", + " }\n", + "\n", + " function load_libs(js_urls, callback) {\n", + " window._bokeh_onload_callbacks.push(callback);\n", + " if (window._bokeh_is_loading > 0) {\n", + " console.log(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n", + " return null;\n", + " }\n", + " if (js_urls == null || js_urls.length === 0) {\n", + " run_callbacks();\n", + " return null;\n", + " }\n", + " console.log(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n", + " window._bokeh_is_loading = js_urls.length;\n", + " for (var i = 0; i < js_urls.length; i++) {\n", + " var url = js_urls[i];\n", + " var s = document.createElement('script');\n", + " s.src = url;\n", + " s.async = false;\n", + " s.onreadystatechange = s.onload = function() {\n", + " window._bokeh_is_loading--;\n", + " if (window._bokeh_is_loading === 0) {\n", + " console.log(\"Bokeh: all BokehJS libraries loaded\");\n", + " run_callbacks()\n", + " }\n", + " };\n", + " s.onerror = function() {\n", + " console.warn(\"failed to load library \" + url);\n", + " };\n", + " console.log(\"Bokeh: injecting script tag for BokehJS library: \", url);\n", + " document.getElementsByTagName(\"head\")[0].appendChild(s);\n", + " }\n", + " };\n", + "\n", + " var js_urls = ['https://cdn.pydata.org/bokeh/release/bokeh-0.11.1.min.js', 'https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.11.1.min.js', 
'https://cdn.pydata.org/bokeh/release/bokeh-compiler-0.11.1.min.js'];\n", + "\n", + " var inline_js = [\n", + " function(Bokeh) {\n", + " Bokeh.set_log_level(\"info\");\n", + " },\n", + " \n", + " function(Bokeh) {\n", + " Bokeh.$(\"#a0c72a2f-02d6-4e89-841b-ca1b77a4d633\").text(\"BokehJS successfully loaded\");\n", + " },\n", + " function(Bokeh) {\n", + " console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-0.11.1.min.css\");\n", + " Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-0.11.1.min.css\");\n", + " console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.11.1.min.css\");\n", + " Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.11.1.min.css\");\n", + " }\n", + " ];\n", + "\n", + " function run_inline_js() {\n", + " for (var i = 0; i < inline_js.length; i++) {\n", + " inline_js[i](window.Bokeh);\n", + " }\n", + " }\n", + "\n", + " if (window._bokeh_is_loading === 0) {\n", + " console.log(\"Bokeh: BokehJS loaded, going straight to plotting\");\n", + " run_inline_js();\n", + " } else {\n", + " load_libs(js_urls, function() {\n", + " console.log(\"Bokeh: BokehJS plotting callback run at\", now());\n", + " run_inline_js();\n", + " });\n", + " }\n", + "}(this));" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from __future__ import print_function, division\n", + "import numpy as np\n", + "import pandas as pd\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "import seaborn as sns\n", + "from bokeh.plotting import figure, show, output_notebook, output_file\n", + "from bokeh.models import ColumnDataSource, Range1d\n", + "\n", + "output_notebook()" + ] + }, + { + "cell_type": "code", + "execution_count": 319, + "metadata": { + "collapsed": false, + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SentenceIdClause TextCodesExperimentValuesExperimentSpanParagraphHeadingsFloatingBox?Discourse TypefriesSentenceIdfriesEventsIdsfriesEventsDetailsfriesEventText
0s1Mechanisms through which[][][]-NaNFalsenone----
1s1Sos-1 coordinates the activation of Ras and Rac[][][]-NaNFalsenone----
2s2Signaling from receptor tyrosine kinases ( RTK...[][][]p4NaNFalsenone----
3s2requires the sequential activation of the smal...[][][]p4NaNFalsefact----
4s3Son of sevenless ( Sos-1 ) , a bifunctional gu...[][][]p4NaNFalsefact----
\n", + "
" + ], + "text/plain": [ + " SentenceId Clause Text Codes \\\n", + "0 s1 Mechanisms through which [] \n", + "1 s1 Sos-1 coordinates the activation of Ras and Rac [] \n", + "2 s2 Signaling from receptor tyrosine kinases ( RTK... [] \n", + "3 s2 requires the sequential activation of the smal... [] \n", + "4 s3 Son of sevenless ( Sos-1 ) , a bifunctional gu... [] \n", + "\n", + " ExperimentValues ExperimentSpan Paragraph Headings FloatingBox? \\\n", + "0 [] [] - NaN False \n", + "1 [] [] - NaN False \n", + "2 [] [] p4 NaN False \n", + "3 [] [] p4 NaN False \n", + "4 [] [] p4 NaN False \n", + "\n", + " Discourse Type friesSentenceId friesEventsIds friesEventsDetails \\\n", + "0 none - - - \n", + "1 none - - - \n", + "2 none - - - \n", + "3 fact - - - \n", + "4 fact - - - \n", + "\n", + " friesEventText \n", + "0 - \n", + "1 - \n", + "2 - \n", + "3 - \n", + "4 - " + ] + }, + "execution_count": 319, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tsv = pd.read_csv('/Users/Gully/Documents/Projects/2_active/bigMech/work/2016-07-27-pathwayLogic/tsv1file/PMC2173577.scidp.discourse.tsv',\n", + " sep='\\t')\n", + "pmcId = \"PMC2173577\"\n", + "tsv.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 323, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import re\n", + "from sets import Set\n", + "\n", + "def read_codes(es):\n", + "\n", + " if( es != es ):\n", + " es = \"[]\"\n", + " \n", + " removeCommaAndRegex = re.compile(r\", and \", re.IGNORECASE)\n", + " es = removeCommaAndRegex.sub(\",\", es)\n", + "\n", + " removeAndRegex = re.compile(r\" and \", re.IGNORECASE)\n", + " es = removeAndRegex.sub(\",\", es)\n", + " \n", + " codes = Set()\n", + "\n", + " for c in re.findall('[Ss]{0,1}\\d+[\\s,]{0,2}[A-Za-z,;\\-\\s]*', es):\n", + "\n", + " #print(c)\n", + " \n", + " simpleM = re.match('\\d+$', c)\n", + " simpleSubM = re.match('\\d+[\\s,]{0,2}([A-Za-z])', c)\n", + " intM = 
re.match('\\d+[\\s,]{0,2}([A-Za-z]+)\\-([A-Za-z]+)', c)\n", + " comma2M = re.match('\\d+[\\s,]{0,2}([A-Za-z]+)[;,]\\s{0,1}([A-Za-z]+)', c)\n", + " comma3M = re.match('\\d+[\\s,]{0,2}([A-Za-z]+)[;,]\\s{0,1}([A-Za-z]+)[;,]\\s{0,1}([A-Za-z]+)', c)\n", + " \n", + " suppM = re.match('([Ss]){1,1}\\d+', c)\n", + " \n", + " figM = re.match('(\\d+)', c)\n", + " fig = figM.group(1)\n", + " \n", + " if( intM is not None ):\n", + " start = ord(intM.group(1))\n", + " end = ord(intM.group(2))\n", + " for ascii_code in range(start, end+1): \n", + " codes.add(fig + chr(ascii_code))\n", + " #print(\" int:\" + fig + chr(ascii_code))\n", + " \n", + " elif( comma3M is not None ):\n", + " codes.add(fig + comma3M.group(1))\n", + " codes.add(fig + comma3M.group(2))\n", + " codes.add(fig + comma3M.group(3))\n", + " #print(\" comma3:\" + fig + comma3M.group(1))\n", + " #print(\" comma3:\" + fig + comma3M.group(2))\n", + " #print(\" comma3:\" + fig + comma3M.group(3))\n", + " \n", + " elif( comma2M is not None ):\n", + " codes.add(fig + comma2M.group(1))\n", + " codes.add(fig + comma2M.group(2))\n", + " #print(\" comma2:\" + fig + comma2M.group(1))\n", + " #print(\" comma2:\" + fig + comma2M.group(2))\n", + " \n", + " elif( simpleM is not None ):\n", + " codes.add(fig)\n", + " #print(\" simple:\" + fig)\n", + " \n", + " elif( simpleSubM is not None ):\n", + " codes.add(fig + simpleSubM.group(1))\n", + " #print(\" simpleSub:\" + fig + simpleSubM.group(1))\n", + " \n", + " return codes" + ] + }, + { + "cell_type": "code", + "execution_count": 358, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " expt clause_id discourse_type heading expt_id color\n", + "0 1A 58 method Results 0 LightGray\n", + "1 1A 59 method Results 0 LightGray\n", + "2 1A 60 method Results 0 LightGray\n", + "3 1A 61 method Results 0 LightGray\n", + "4 1B 62 result Results 1 Thistle\n", + "5 1C 63 goal Results 2 LightGray\n", + "6 1C 64 method Results 
2 LightGray\n", + "7 1C 65 method Results 2 LightGray\n", + "8 1C 66 fact Results 2 Snow\n", + "9 1C 67 fact Results 2 Snow\n", + "10 1C 68 result Results 2 Thistle\n", + "11 1C 69 method Results 2 LightGray\n", + "12 1C 70 result Results 2 Thistle\n", + "13 1C 71 result Results 2 Thistle\n", + "14 1C 72 result Results 2 Thistle\n", + "15 1C 73 result Results 2 Thistle\n", + "16 2A 91 goal Results 3 LightGray\n", + "17 2A 92 method Results 3 LightGray\n", + "18 2A 93 method Results 3 LightGray\n", + "19 2A 94 result Results 3 Thistle\n", + "20 2A 95 method Results 3 LightGray\n", + "21 2A 96 implication Results 3 Plum\n", + "22 2D 99 hypothesis Results 6 Snow\n", + "23 3B 99 hypothesis Results 8 Snow\n", + "24 3A 99 hypothesis Results 7 Snow\n", + "25 2B 99 hypothesis Results 4 Snow\n", + "26 2C 99 hypothesis Results 5 Snow\n", + "30 2B 100 method Results 4 LightGray\n", + "29 3A 100 method Results 7 LightGray\n", + "31 2C 100 method Results 5 LightGray\n", + ".. ... ... ... ... ... ...\n", + "100 7C 184 hypothesis Results 15 Snow\n", + "101 7C 185 hypothesis Results 15 Snow\n", + "102 7C 186 fact Results 15 Snow\n", + "103 7C 187 fact Results 15 Snow\n", + "104 7C 188 hypothesis Results 15 Snow\n", + "105 7C 189 fact Results 15 Snow\n", + "106 7C 190 method Results 15 LightGray\n", + "107 7C 191 method Results 15 LightGray\n", + "108 7C 192 method Results 15 LightGray\n", + "109 7C 193 result Results 15 Thistle\n", + "110 7C 194 implication Results 15 Plum\n", + "111 7C 195 result Results 15 Thistle\n", + "112 7C 196 result Results 15 Thistle\n", + "113 7C 197 implication Results 15 Plum\n", + "114 7C 198 implication Results 15 Plum\n", + "115 8B 202 hypothesis Results 17 Snow\n", + "116 8A 202 hypothesis Results 16 Snow\n", + "117 8B 203 hypothesis Results 17 Snow\n", + "118 8A 203 hypothesis Results 16 Snow\n", + "119 8B 204 hypothesis Results 17 Snow\n", + "120 8A 204 hypothesis Results 16 Snow\n", + "121 8B 205 result Results 17 Thistle\n", + "122 8A 205 
result Results 16 Thistle\n", + "123 8B 206 result Results 17 Thistle\n", + "124 8A 206 result Results 16 Thistle\n", + "125 8A 209 result Results 16 Thistle\n", + "126 8C 210 result Results 18 Thistle\n", + "127 8D 214 method Results 19 LightGray\n", + "128 8D 215 result Results 19 Thistle\n", + "129 8D 216 result Results 19 Thistle\n", + "\n", + "[130 rows x 6 columns]\n" + ] + } + ], + "source": [ + "gantt_rows = []\n", + "gantt2_rows = []\n", + "\n", + "dtypes = [\"fact\",\"hypothesis\",\"problem\",\"goal\" ,\"method\",\"result\",\"implication\"]\n", + "colors = [\"Snow\" ,\"Snow\" ,\"Snow\" ,\"LightGray\",\"LightGray\" ,\"Thistle\" ,\"Plum\"] \n", + "colors_s = pd.Series(colors, index=dtypes)\n", + "\n", + "all_codes = Set() \n", + "\n", + "clause_max = -1\n", + "clause_min = 1000\n", + "\n", + "for i,row in tsv.iterrows():\n", + " es = row['ExperimentValues']\n", + " exptSpan = row['ExperimentSpan']\n", + " dt = row['Discourse Type']\n", + " sid = row['SentenceId']\n", + " paragraph = row['Paragraph']\n", + " heading = row['Headings']\n", + " floatingBox = row['FloatingBox?']\n", + "\n", + " if( heading != heading ):\n", + " heading = \"\"\n", + "\n", + " #if(not floatingBox):\n", + " # clause_max = i\n", + " \n", + " if( re.match('^Result', heading) is None or floatingBox):\n", + " continue\n", + " \n", + " if( i > clause_max):\n", + " clause_max = i\n", + " if( i < clause_min):\n", + " clause_min = i\n", + " \n", + " codes = read_codes(es) \n", + " for c in codes:\n", + " gantt_rows.append([c, i, dt, heading])\n", + " all_codes.add(c)\n", + " \n", + " spanCodes = read_codes(exptSpan)\n", + " for c in spanCodes:\n", + " gantt2_rows.append([c, i, dt, heading])\n", + " \n", + "codes_s = pd.Series(range(len(all_codes)), index=sorted(list(all_codes)))\n", + "\n", + "gantt_df = pd.DataFrame.from_records(gantt_rows, columns=['expt','clause_id','discourse_type', 'heading']) \n", + "gantt_df = gantt_df.sort(columns=['clause_id'], ascending=True)\n", + "\n", + 
"gantt2_df = pd.DataFrame.from_records(gantt2_rows, columns=['expt','clause_id','discourse_type', 'heading']) \n", + "gantt2_df = gantt2_df.sort(columns=['clause_id'], ascending=True)\n", + "\n", + "#print(codes_s.loc[gantt_df['expt'].tolist()].tolist())\n", + "\n", + "gantt_df['expt_id'] = codes_s.loc[gantt_df['expt'].tolist()].tolist()\n", + "gantt2_df['expt_id'] = codes_s.loc[gantt2_df['expt'].tolist()].tolist()\n", + "\n", + "gantt2_df['color'] = colors_s.loc[gantt2_df['discourse_type'].tolist()].tolist()\n", + "\n", + "print(gantt2_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 359, + "metadata": { + "collapsed": false, + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

<Bokeh Notebook handle for In[359]>

" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 359, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "G=figure(title=pmcId, width=800, height=600, \n", + " x_range=Range1d(clause_min, clause_max), y_range=list(codes_s.index.values))\n", + "G.xaxis.axis_label=\"Clause #\"\n", + "G.yaxis.axis_label=\"Figure Code\"\n", + "\n", + "gantt2_df['top']=gantt2_df['expt_id']+0.8\n", + "gantt2_df['bottom']=gantt2_df['expt_id']+1.2\n", + "gantt2_df['left']=gantt2_df['clause_id']-0.5\n", + "gantt2_df['right']=gantt2_df['clause_id']+0.5\n", + "\n", + "cds2 = ColumnDataSource(gantt2_df)\n", + "G.quad(left='left', right='right', bottom='bottom', top='top',source=cds2, line_color=\"gray\", color='color')\n", + "\n", + "cds = ColumnDataSource(gantt_df)\n", + "G.scatter('clause_id', 'expt', source=cds, marker='x', size=15,\n", + " line_color=\"red\", fill_color=\"red4\")\n", + "\n", + "#G.rect(,\"Item\",source=CDS)\n", + "show(G)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.13" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/.ipynb_checkpoints/INTACT Processing Scripts-checkpoint.ipynb b/notebooks/.ipynb_checkpoints/INTACT Processing Scripts-checkpoint.ipynb new file mode 100644 index 0000000..e696f48 --- /dev/null +++ b/notebooks/.ipynb_checkpoints/INTACT Processing Scripts-checkpoint.ipynb @@ -0,0 +1,2160 @@ +{ + "cells": [ + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "Load a list of open access PMID files." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import os\n", + "from subprocess import call\n", + "\n", + "pmids = []\n", + "pmid_file = \"/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/pmids.txt\"\n", + "\n", + "with open(pmid_file) as f:\n", + " for line in f.readlines():\n", + " pmids.append(line.strip())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "Shell commands to build the zipped bundle. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# echo -n application/vnd.wf4ever.robundle+zip > mimetype\n", + "# zip -0 -X ../reach mimetype\n", + "# zip -X -r ../reach . -x mimetype\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Convert the sentences from each paper processed by SciDT into simple sentences for each Figure assignment." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from nltk.corpus import stopwords\n", + "import re\n", + "\n", + "def retrieve_sentences_for_modeling(inFile, fid):\n", + " \n", + " tsv = pd.read_csv(inFile, sep='\\t')\n", + " fig_tagged_sentences = {}\n", + "\n", + " for i, row in tsv.iterrows():\n", + " sid = row['SentenceId']\n", + " codeStr = row['Codes']\n", + " paragraph = row['Paragraph']\n", + " text = row['Sentence Text']\n", + " heading = row['Headings']\n", + " floatingBox = row['FloatingBox?']\n", + " discourse = row['Discourse Type']\n", + " reachData = row['friesEventsTypes']\n", + " fig = row['Figure Assignment']\n", + " offset_start = row['Offset_Begin']\n", + " offset_end = row['Offset_End']\n", + " \n", + " if fig == fig:\n", + " for f in fig.split('|'):\n", + " if( fig_tagged_sentences.get(f, None) is None ):\n", + " sent_list = []\n", + " fig_tagged_sentences[f] = sent_list\n", + " sent_list.append({'sid': sid, 'pid':paragraph, \n", + " 'start': offset_start, 'end': offset_end, 'text': text,\n", + " 'discourse_types': discourse})\n", + " else:\n", + " sent_list = fig_tagged_sentences[f]\n", + " sent_list.append({'sid': sid, 'pid':paragraph, \n", + " 'start': offset_start, 'end': offset_end, 'text': text,\n", + " 'discourse_types': discourse})\n", + " \n", + " \n", + " return fig_tagged_sentences" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10087260\n", + "10087263\n", + "10087265\n", + "10209036\n", + "10225955\n", + "10366597\n", + "10366599\n", + "10385523\n", + "10385526\n", + "10402465\n", + "10429675\n", + "10545507\n", + "10562275\n", + "10562277\n", + "10562279\n", + "10562288\n", + "10601328\n", + "10601346\n", + "10613896\n", + "10620603\n", + "10629222\n", + "10648568\n", + 
"10662770\n", + "10684247\n", + "10704439\n", + "10704444\n", + "10704446\n", + "10725331\n", + "10725334\n", + "10747088\n", + "10747089\n", + "10790433\n", + "10811823\n", + "10831611\n", + "10859335\n", + "10864201\n", + "10871282\n", + "10900456\n", + "10931856\n", + "10931877\n", + "10953014\n", + "10974003\n", + "10995436\n", + "11018051\n", + "11018064\n", + "11034606\n", + "11038172\n", + "11038182\n", + "11076969\n", + "11086001\n", + "11134073\n", + "11149930\n", + "11157975\n", + "11157979\n", + "11157984\n", + "11181702\n", + "11257119\n", + "11266443\n", + "11266449\n", + "11266451\n", + "11309418\n", + "11401320\n", + "11402059\n", + "11448995\n", + "11502761\n", + "11514608\n", + "11564755\n", + "11570975\n", + "11571312\n", + "11591728\n", + "11591731\n", + "11684708\n", + "11724822\n", + "11739401\n", + "11739402\n", + "11739404\n", + "11747467\n", + "11756480\n", + "11777938\n", + "11846885\n", + "11877480\n", + "11914126\n", + "11916981\n", + "11927603\n", + "11927608\n", + "12011112\n", + "12147674\n", + "12167173\n", + "12199906\n", + "12421467\n", + "12446742\n", + "12473693\n", + "12486103\n", + "12486115\n", + "12507995\n", + "12527750\n", + "12566426\n", + "12642614\n", + "12682088\n", + "12689351\n", + "12719471\n", + "12771128\n", + "12782684\n", + "12847081\n", + "12900395\n", + "12939254\n", + "14517205\n", + "14568990\n", + "14612908\n", + "14638857\n", + "14707117\n", + "14709540\n", + "14734533\n", + "14737190\n", + "14970179\n", + "15037601\n", + "15045029\n", + "15051809\n", + "15070402\n", + "15078903\n", + "15096524\n", + "15148308\n", + "15159416\n", + "15302858\n", + "15314064\n", + "15326198\n", + "1541635\n", + "15477347\n", + "15504911\n", + "15575970\n", + "15583694\n", + "15642746\n", + "15653635\n", + "15720729\n", + "15767459\n", + "15796781\n", + "15828860\n", + "15883195\n", + "15928207\n", + "15967037\n", + "16000169\n", + "16027220\n", + "16043514\n", + "16043515\n", + "16061695\n", + "16087707\n", + "16098226\n", + 
"16115959\n", + "16166655\n", + "16179646\n", + "16179649\n", + "16203867\n", + "16254079\n", + "16301747\n", + "16356270\n", + "16396833\n", + "16396834\n", + "16403219\n", + "16415179\n", + "16449187\n", + "16492808\n", + "16513846\n", + "16520382\n", + "16545136\n", + "16603075\n", + "16606443\n", + "16618814\n", + "16636147\n", + "16638120\n", + "16672054\n", + "16717130\n", + "16729043\n", + "16754960\n", + "16839418\n", + "16847100\n", + "16872538\n", + "16880273\n", + "16893970\n", + "16923827\n", + "16945160\n", + "16982639\n", + "16990252\n", + "17000877\n", + "17030985\n", + "17085477\n", + "17112379\n", + "17151076\n", + "17183697\n", + "17224084\n", + "17280616\n", + "17284314\n", + "17341466\n", + "17353368\n", + "17353931\n", + "17407569\n", + "17412707\n", + "17470632\n", + "17485491\n", + "17485524\n", + "17500595\n", + "17511879\n", + "17543119\n", + "17557078\n", + "17581628\n", + "17591856\n", + "17605817\n", + "17608567\n", + "17612402\n", + "17620405\n", + "17620407\n", + "17623094\n", + "17627824\n", + "17650322\n", + "17660750\n", + "17660751\n", + "17667950\n", + "17690686\n", + "17721441\n", + "17724128\n", + "17762866\n", + "17889823\n", + "17936057\n", + "17937504\n", + "17948060\n", + "17986458\n", + "18000013\n", + "18034155\n", + "18086859\n", + "18154663\n", + "18157088\n", + "18171471\n", + "18188153\n", + "18188154\n", + "18208323\n", + "18226242\n", + "18239682\n", + "18256700\n", + "18266467\n", + "18268103\n", + "18286207\n", + "18289379\n", + "18292755\n", + "18301737\n", + "18309292\n", + "18309293\n", + "18309296\n", + "18320063\n", + "18354501\n", + "18377662\n", + "18388858\n", + "18394558\n", + "18412956\n", + "18421166\n", + "18430226\n", + "18433452\n", + "18435708\n", + "18447585\n", + "18452624\n", + "18458160\n", + "18466225\n", + "18479511\n", + "18497748\n", + "18498651\n", + "18498752\n", + "18509523\n", + "18511940\n", + "18518979\n", + "18551167\n", + "18560762\n", + "18573912\n", + "18583960\n", + "18583988\n", + 
"18586827\n", + "18604270\n", + "18612383\n", + "18617507\n", + "18628297\n", + "18628823\n", + "18629017\n", + "18631241\n", + "18647389\n", + "18662404\n", + "18663010\n", + "18665261\n", + "18671868\n", + "18682833\n", + "18703495\n", + "18724936\n", + "18758438\n", + "18761697\n", + "18762578\n", + "18762581\n", + "18775314\n", + "18775702\n", + "18779372\n", + "18781224\n", + "18794331\n", + "18800055\n", + "18802460\n", + "18808384\n", + "18812399\n", + "18818696\n", + "18833289\n", + "18836139\n", + "18923419\n", + "18927618\n", + "18946488\n", + "18953286\n", + "18955484\n", + "18985028\n", + "19008859\n", + "19019158\n", + "19037259\n", + "19055777\n", + "19060904\n", + "19063885\n", + "19079254\n", + "19088080\n", + "19088272\n", + "19107194\n", + "19107203\n", + "19114595\n", + "19118384\n", + "19131970\n", + "19135240\n", + "19150989\n", + "19153600\n", + "19155274\n", + "19156129\n", + "19158676\n", + "19165350\n", + "19167335\n", + "19171758\n", + "19214185\n", + "19229298\n", + "19277118\n", + "19290556\n", + "19322195\n", + "19322197\n", + "19329994\n", + "19360002\n", + "19369943\n", + "19407811\n", + "19419567\n", + "19432797\n", + "19440292\n", + "19440376\n", + "19455133\n", + "19486527\n", + "19494831\n", + "19498462\n", + "19498465\n", + "19506933\n", + "19520861\n", + "19521502\n", + "19523115\n", + "19536134\n", + "19536198\n", + "19543227\n", + "19567478\n", + "19570034\n", + "19570982\n", + "19575010\n", + "19590579\n", + "19609305\n", + "19616007\n", + "19619546\n", + "19625296\n", + "19629177\n", + "19635168\n", + "19636380\n", + "19648646\n", + "19680228\n", + "19680552\n", + "19682256\n", + "19690564\n", + "19696784\n", + "19701182\n", + "19701191\n", + "19704022\n", + "19730435\n", + "19730696\n", + "19736317\n", + "19746159\n", + "19763081\n", + "19765300\n", + "19767740\n", + "19781631\n", + "19797078\n", + "19798056\n", + "19798101\n", + "19807924\n", + "19841731\n", + "19874541\n", + "19887001\n", + "19888460\n", + "19888464\n", + 
"19893485\n", + "19893486\n", + "19893489\n", + "19893491\n", + "19903340\n", + "19927124\n", + "19933256\n", + "19933576\n", + "19934257\n", + "19934264\n", + "19940019\n", + "19941819\n", + "19941825\n", + "19942852\n", + "19959993\n", + "19959995\n", + "19996314\n", + "20007317\n", + "20037628\n", + "20043912\n", + "20071408\n", + "20075079\n", + "20075868\n", + "20094031\n", + "20098747\n", + "20110348\n", + "20111005\n", + "20123736\n", + "20129940\n", + "20140193\n", + "20141835\n", + "20169075\n", + "20169078\n", + "20169165\n", + "20174651\n", + "20178605\n", + "20186120\n", + "20205919\n", + "20211136\n", + "20214800\n", + "20224550\n", + "20231380\n", + "20300060\n", + "20305656\n", + "20308429\n", + "20338032\n", + "20353594\n", + "20360680\n", + "20362541\n", + "20362542\n", + "20368803\n", + "20371544\n", + "20375098\n", + "20388642\n", + "20399778\n", + "20400938\n", + "20410134\n", + "20418871\n", + "20418951\n", + "20434988\n", + "20436455\n", + "20439537\n", + "20453830\n", + "20456499\n", + "20467438\n", + "20471980\n", + "20512112\n", + "20529865\n", + "20540776\n", + "20543819\n", + "20559324\n", + "20561531\n", + "20562859\n", + "20574810\n", + "20579338\n", + "20581830\n", + "20584916\n", + "20594350\n", + "20601937\n", + "20603002\n", + "20603614\n", + "20624308\n", + "20628654\n", + "20639901\n", + "20639902\n", + "20642453\n", + "20657822\n", + "20659021\n", + "20664520\n", + "20676093\n", + "20676095\n", + "20676135\n", + "20686606\n", + "20693977\n", + "20697347\n", + "20697357\n", + "20700126\n", + "20706207\n", + "20711500\n", + "20729920\n", + "20738866\n", + "20802085\n", + "20802534\n", + "20802536\n", + "20817927\n", + "20818336\n", + "20818435\n", + "20819940\n", + "20840750\n", + "20843328\n", + "20856196\n", + "20856200\n", + "20856870\n", + "20862261\n", + "20865124\n", + "20871633\n", + "20881089\n", + "20890303\n", + "20890305\n", + "20920251\n", + "20924358\n", + "20929568\n", + "20929579\n", + "20932347\n", + "20935634\n", + 
"20935647\n", + "20935677\n", + "20936779\n", + "20953186\n", + "20969766\n", + "20972225\n", + "20972459\n", + "20976523\n", + "21034468\n", + "21037577\n", + "21047798\n", + "21048921\n", + "21048939\n", + "21057456\n", + "21057510\n", + "21063388\n", + "21078624\n", + "21092281\n", + "21092292\n", + "21110861\n", + "21112398\n", + "21113127\n", + "21114864\n", + "21118991\n", + "21119599\n", + "21119626\n", + "21124868\n", + "21124943\n", + "21131964\n", + "21131965\n", + "21131967\n", + "21132010\n", + "21139566\n", + "21147767\n", + "21148288\n", + "21149568\n", + "21151104\n", + "21157431\n", + "2116421\n", + "21170087\n", + "21172016\n", + "21179004\n", + "21179020\n", + "21179510\n", + "2118142\n", + "21186367\n", + "21187329\n", + "21203429\n", + "21203436\n", + "21209940\n", + "21212461\n", + "2121740\n", + "21217644\n", + "21217774\n", + "21219645\n", + "21220045\n", + "21224849\n", + "21224850\n", + "21242965\n", + "21242966\n", + "21242980\n", + "21245844\n", + "21247419\n", + "21251231\n", + "21252856\n", + "21274006\n", + "21277013\n", + "21278383\n", + "21278420\n", + "21278786\n", + "21288885\n", + "21297662\n", + "21306563\n", + "21311558\n", + "21314951\n", + "21317875\n", + "21328542\n", + "21335238\n", + "21336258\n", + "21338522\n", + "21347350\n", + "21364888\n", + "21386817\n", + "21386897\n", + "21390248\n", + "21399620\n", + "21399639\n", + "21399666\n", + "21407176\n", + "21407215\n", + "21408167\n", + "21415856\n", + "21423209\n", + "21423216\n", + "21427704\n", + "21439629\n", + "21445305\n", + "21447707\n", + "21454693\n", + "21498514\n", + "21505799\n", + "21507240\n", + "21516116\n", + "21525870\n", + "21525958\n", + "21526181\n", + "21533037\n", + "21541365\n", + "21556049\n", + "21559518\n", + "21569246\n", + "21575178\n", + "21575199\n", + "21577200\n", + "21586138\n", + "21602887\n", + "21613545\n", + "21625644\n", + "21637789\n", + "21642953\n", + "21643011\n", + "21668996\n", + "21669201\n", + "21679440\n", + "21685908\n", + 
"21685939\n", + "21685944\n", + "21689417\n", + "21698133\n", + "21701560\n", + "21705390\n", + "21706061\n", + "21718540\n", + "21725360\n", + "21725367\n", + "21734647\n", + "21743437\n", + "21743479\n", + "21743491\n", + "21747946\n", + "21781306\n", + "21798038\n", + "21804533\n", + "21811563\n", + "21822214\n", + "21847096\n", + "21847100\n", + "21857973\n", + "21871133\n", + "21874024\n", + "21875956\n", + "21880142\n", + "21884581\n", + "21890893\n", + "21893585\n", + "21903581\n", + "21908610\n", + "21909133\n", + "21909281\n", + "21931555\n", + "21931591\n", + "21943085\n", + "21946559\n", + "21946560\n", + "21952049\n", + "21964608\n", + "21988832\n", + "21998301\n", + "22010978\n", + "22014111\n", + "22016384\n", + "22022230\n", + "22022540\n", + "22027862\n", + "22028648\n", + "22034500\n", + "22046270\n", + "22048310\n", + "22056778\n", + "22056872\n", + "22057290\n", + "22059385\n", + "22068330\n", + "22072986\n", + "22087277\n", + "22094269\n", + "22096563\n", + "22102817\n", + "22116401\n", + "22118466\n", + "22118625\n", + "22135285\n", + "22157895\n", + "22162999\n", + "22163275\n", + "22174692\n", + "22174833\n", + "22207579\n", + "22238662\n", + "22242148\n", + "22269274\n", + "22270917\n", + "22279592\n", + "22280843\n", + "22291595\n", + "22303461\n", + "22323290\n", + "22323517\n", + "22325148\n", + "22334672\n", + "22355679\n", + "22363216\n", + "22382979\n", + "22387996\n", + "22401567\n", + "22402981\n", + "22404908\n", + "22406378\n", + "22406686\n", + "22413019\n", + "22442151\n", + "22446626\n", + "22447027\n", + "22458338\n", + "22470507\n", + "22471946\n", + "22491013\n", + "22493164\n", + "22493500\n", + "22500027\n", + "22510880\n", + "22510882\n", + "22518138\n", + "22540012\n", + "22555292\n", + "22575651\n", + "22581261\n", + "22609302\n", + "22613832\n", + "22623428\n", + "22634751\n", + "22648170\n", + "22651821\n", + "22662192\n", + "22674187\n", + "22685417\n", + "22731636\n", + "22791023\n", + "22808155\n", + "22829933\n", + 
"22842785\n", + "22850675\n", + "22863774\n", + "22892566\n", + "22899650\n", + "22904065\n", + "22905162\n", + "22908322\n", + "22916011\n", + "22939623\n", + "22940692\n", + "22952686\n", + "22952718\n", + "22962574\n", + "22962849\n", + "22966907\n", + "22977175\n", + "23022564\n", + "23023393\n", + "23042150\n", + "23056421\n", + "23065768\n", + "23075850\n", + "23082202\n", + "23082758\n", + "23085988\n", + "23086447\n", + "23086448\n", + "23088713\n", + "23104095\n", + "23104097\n", + "23142775\n", + "23143267\n", + "23161686\n", + "23170778\n", + "23183827\n", + "23209657\n", + "23216645\n", + "23217712\n", + "23236467\n", + "23253866\n", + "23263555\n", + "23275563\n", + "23284848\n", + "23316280\n", + "23332754\n", + "23353684\n", + "23353889\n", + "23369005\n", + "23369981\n", + "23395900\n", + "23395907\n", + "23399914\n", + "23403925\n", + "23405092\n", + "23414517\n", + "23431397\n", + "23449449\n", + "23455607\n", + "23463101\n", + "23467085\n", + "23505436\n", + "23511972\n", + "23514585\n", + "23520446\n", + "23533724\n", + "23549287\n", + "23549480\n", + "23555304\n", + "23565095\n", + "23582324\n", + "23582331\n", + "23585889\n", + "23593007\n", + "23621612\n", + "23622247\n", + "23634843\n", + "23650535\n", + "23658700\n", + "23667408\n", + "23675303\n", + "23680104\n", + "23685356\n", + "23693014\n", + "23706742\n", + "23708798\n", + "23725059\n", + "23734815\n", + "23737971\n", + "23741051\n", + "23750211\n", + "23752268\n", + "23758976\n", + "23772379\n", + "23773523\n", + "23782464\n", + "23788678\n", + "23799140\n", + "23799367\n", + "23823123\n", + "23829672\n", + "23840630\n", + "23840749\n", + "23840900\n", + "23855374\n", + "23857585\n", + "23861867\n", + "23866081\n", + "23890821\n", + "23902751\n", + "23907583\n", + "23909438\n", + "23910724\n", + "2391361\n", + "23918937\n", + "23933751\n", + "23935490\n", + "23935497\n", + "23940795\n", + "23948297\n", + "23949442\n", + "23967200\n", + "23979715\n", + "24001151\n", + "24006493\n", + 
"24009510\n", + "24009866\n", + "24034246\n", + "24035192\n", + "24056303\n", + "24063750\n", + "24065129\n", + "24069158\n", + "24069330\n", + "24069433\n", + "24075010\n", + "24076655\n", + "24076656\n", + "24083380\n", + "24086303\n", + "24090070\n", + "24094005\n", + "24098548\n", + "24113872\n", + "24117850\n", + "24125847\n", + "24145797\n", + "24161670\n", + "24167781\n", + "24176932\n", + "24189400\n", + "24191246\n", + "24223725\n", + "24240174\n", + "24243021\n", + "24244371\n", + "24263861\n", + "24269683\n", + "24274578\n", + "24275654\n", + "24282027\n", + "24286120\n", + "24311597\n", + "24314029\n", + "24330623\n", + "24344185\n", + "24349196\n", + "24349490\n", + "24365180\n", + "24366813\n", + "24374083\n", + "24397932\n", + "24416391\n", + "24434184\n", + "24473148\n", + "24502362\n", + "24515439\n", + "24527098\n", + "24555568\n", + "24561554\n", + "24563863\n", + "24566989\n", + "24568222\n", + "24582333\n", + "24587342\n", + "24610369\n", + "24618038\n", + "24618592\n", + "24626987\n", + "24643253\n", + "24651726\n", + "24656813\n", + "24722491\n", + "24754922\n", + "24798445\n", + "24823443\n", + "24835508\n", + "24843023\n", + "24847877\n", + "24855951\n", + "24872509\n", + "24879895\n", + "24904275\n", + "24914955\n", + "24937146\n", + "24960027\n", + "24960071\n", + "24963139\n", + "24964212\n", + "24983867\n", + "25009464\n", + "25147953\n", + "25159688\n", + "25170085\n", + "25171412\n", + "25225338\n", + "25260594\n", + "25260751\n", + "25277244\n", + "25278935\n", + "25294836\n", + "25294943\n", + "25314077\n", + "25321483\n", + "25360523\n", + "25374563\n", + "25425574\n", + "25445562\n", + "25473596\n", + "25519916\n", + "25533335\n", + "25609649\n", + "25653167\n", + "25697406\n", + "25767811\n", + "7528772\n", + "7561682\n", + "7593161\n", + "7807015\n", + "7844150\n", + "8253836\n", + "8551220\n", + "8609167\n", + "8627166\n", + "8627180\n", + "8666671\n", + "8666672\n", + "8691146\n", + "8691154\n", + "8707857\n", + "8858162\n", + 
"8922390\n", + "9060478\n", + "9128257\n", + "9151673\n", + "9214383\n", + "9214386\n", + "9334338\n", + "9412461\n", + "9425168\n", + "9472029\n", + "9531549\n", + "9531566\n", + "9628892\n", + "9660868\n", + "9700171\n", + "9763420\n", + "9786960\n", + "9813092\n", + "9817749\n", + "9864353\n", + "9864360\n", + "9922454\n", + "9971739\n" + ] + } + ], + "source": [ + "tsv_dir = \"/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/scidt_fries_bioc_tsv4\"\n", + "sentence_dir = \"/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/fig_sentences\"\n", + "\n", + "for root, dirs, files in os.walk(tsv_dir):\n", + " for file in files: \n", + " if os.path.isfile(root+'/'+file) and file[-4:]=='.tsv' :\n", + " pmid = file[:-4]\n", + " if( pmid in pmids ):\n", + " print( pmid )\n", + " fig_tagged_sentences = retrieve_sentences_for_modeling(root+'/'+file, pmid)\n", + " for fig in fig_tagged_sentences.keys():\n", + " out = open(sentence_dir+'/'+pmid+'_'+fig+'.txt', 'w')\n", + " for sent_hash in fig_tagged_sentences[fig]:\n", + " out.write(sent_hash['text'] + '\\n')\n", + " out.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Functions to simplify INTACT records from their standard XML into TSV format. " + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from bs4 import BeautifulSoup\n", + "import re\n", + "\n", + "def build_figure_extraction_patterns():\n", + " bf = \"\\s*f(igs|igs\\.|ig|ig\\.|igure|\\.|ig\\:){0,1}\"\n", + " d = \"\\s*(\\d+\\s*[\\.\\;\\,]{0,1}\\s*[a-z]*)\\s*\\.{0,1}\\s*\"\n", + " d_split = \"\\s*(\\d*)\\s*[\\.\\;\\,]{0,1}\\s*([a-z]*)\"\n", + " interval = \"\\s*(\\d+)([a-z]+)\\\\-([a-z]+)\"\n", + " pattHash = {} \n", + " \n", + " figPatt = []\n", + " pattHash['figPatt'] = figPatt\n", + " \n", + " # 0. No alphanumeric codes at all: 'Figure. 
1; more text'\n", + " figPatt.append(re.compile(\"^\" + bf + d + \"$\")) \n", + " figPatt.append(re.compile(\"^\" + bf + \"\\s*(\\d+\\s*[\\.\\;\\,]{0,1}\\s*[a-z]*)[\\,\\;\\.]{0,1}\\s*t\"))\n", + " figPatt.append(re.compile(\"^\" + bf + \"\\s*(\\d+\\s*[\\.\\;\\,]{0,1}\\s*[a-z]*)[\\,\\;\\.]{0,1}\\s*s\"))\n", + " figPatt.append(re.compile(\"^\" + bf + \"\\s*(\\d+\\s*[\\.\\;\\,]{0,1}\\s*[a-z]*)[\\,\\;\\.]{0,1}\\s+and\\s+s\"))\n", + " \n", + " # [1]\n", + " simplePatt = re.compile(\"^\" + d + \"$\");\n", + " pattHash['simplePatt'] = simplePatt\n", + " \n", + " # [2,4] \n", + " space2Patt = re.compile(\"^\" + bf + d + \"\\s+\" + bf + d + \"$\");\n", + " pattHash['space2Patt'] = space2Patt\n", + "\n", + " # [2,4,6] \n", + " space3Patt = re.compile(\"^\"+bf+d+\"\\s+\"+bf+d+\"\\s+\"+bf+d+\"$\");\n", + " pattHash['space3Patt'] = space3Patt\n", + "\n", + " # [2,4]\n", + " fullComma2Patt = re.compile(\"^\" + bf + d + \"[\\;\\,]\" + bf + d + \"$\")\n", + " pattHash['fullComma2Patt'] = fullComma2Patt\n", + " \n", + " # [2,3]\n", + " comma2Patt = re.compile(\"^\" + bf + d + \"[\\;\\,]\" + d + \"$\")\n", + " pattHash['comma2Patt'] = comma2Patt\n", + "\n", + " # [1,2]\n", + " simpleComma2Patt = re.compile(\"^\" + d + \"[\\;\\,]\" + d + \"$\")\n", + " pattHash['simpleComma2Patt'] = simpleComma2Patt\n", + "\n", + " # [2,3,4]\n", + " comma3Patt = re.compile(\"^\" + bf + d + \"[\\;\\,]\" + d + \"[\\;\\,]\" + d + \"$\");\n", + " pattHash['comma3Patt'] = comma3Patt\n", + " \n", + " # [1,2,3]\n", + " simpleComma3Patt = re.compile(\"^\" + d + \"[\\;\\,]\" + d + \"[\\;\\,]\" + d + \"$\");\n", + " pattHash['simpleComma3Patt'] = simpleComma3Patt\n", + "\n", + " # [2,3,4,5]\n", + " comma4Patt = re.compile(\"^\"+bf+d+\"[\\;\\,]\"+d+\"[\\;\\,]\"+d+\"[\\;\\,]\"+d+\"$\");\n", + " pattHash['comma4Patt'] = comma4Patt\n", + "\n", + " # [2,3,4,5,6]\n", + " comma5Patt = re.compile(\"^\"+bf+d+\"[\\;\\,]\"+d+\"[\\;\\,]\"+d+\"[\\;\\,]\"+d+\"[\\;\\,]\"+d+\"$\");\n", + " pattHash['comma5Patt'] = 
comma5Patt\n", + "\n", + " # [1,2,3,4]\n", + " simpleComma4Patt = re.compile(\"^\"+d+\"[\\;\\,]\"+d+\"[\\;\\,]\"+d+\"[\\;\\,]\"+d+\"$\");\n", + " pattHash['simpleComma4Patt'] = simpleComma4Patt\n", + "\n", + " # [2,3]\n", + " and2Patt = re.compile(\"^\" + bf + d + \"\\s+and\\s+\" + d + \"$\");\n", + " pattHash['and2Patt'] = and2Patt\n", + " \n", + " # [1,2]\n", + " simpleAnd2Patt = re.compile(\"^\" + d + \"\\s+and\\s+\" + d + \"$\");\n", + " pattHash['simpleAnd2Patt'] = simpleAnd2Patt\n", + "\n", + " # [1,2,3]\n", + " simple_a_and_b_patt = re.compile(\"^\" + d_split + \"\\s+and\\s+([a-z])$\");\n", + " pattHash['simple_a_and_b_patt'] = simple_a_and_b_patt\n", + "\n", + " # [2,3,4]\n", + " a_and_b_patt = re.compile(\"^\" + bf + d_split + \"\\s+and\\s+([a-z])$\");\n", + " pattHash['a_and_b_patt'] = a_and_b_patt\n", + "\n", + " # [1,2,3]\n", + " simple_a_comma_b_patt = re.compile(\"^\" + d_split + \"[\\;\\,]\\s*([a-z])$\");\n", + " pattHash['simple_a_comma_b_patt'] = simple_a_comma_b_patt\n", + "\n", + " # [2,3,4]\n", + " a_comma_b_patt = re.compile(\"^\"+bf+d_split+\"[\\;\\,]\\s*([a-z])$\");\n", + " pattHash['a_comma_b_patt'] = a_comma_b_patt\n", + "\n", + " # [1,2,3]\n", + " simple_a_comma_b_comma_c_patt = re.compile(\"^\" + d_split + \"[\\;\\,]\\s*([a-z])\\s*[\\;\\,]\\s*([a-z])$\");\n", + " pattHash['simple_a_comma_b_comma_c_patt'] = simple_a_comma_b_comma_c_patt\n", + "\n", + " # [2,3,4]\n", + " a_comma_b_comma_c_patt = re.compile(\"^\"+bf+d_split+\"[\\;\\,]\\s*([a-z])\\s*[\\;\\,]\\s*([a-z])$\");\n", + " pattHash['a_comma_b_comma_c_patt'] = a_comma_b_comma_c_patt\n", + "\n", + " # [2,3,4,5]\n", + " a_b_and_c_patt = re.compile(\"^\" + bf + d_split + \"[\\;\\,]\\s+([a-z])\\s+and\\s+([a-z])$\");\n", + " pattHash['a_b_and_c_patt'] = a_b_and_c_patt\n", + "\n", + " # [1,2,3,4]\n", + " simple_a_b_and_c_patt = re.compile(\"^\" + d_split + \"[\\;\\,]\\s+([a-z])\\s+and\\s+([a-z])$\");\n", + " pattHash['simple_a_b_and_c_patt'] = simple_a_b_and_c_patt\n", + "\n", + " 
tableFigPatt = re.compile(\"^t(ab\\.|ab|able){0,1}.*\" + bf + d + \"$\");\n", + " pattHash['tableFigPatt'] = tableFigPatt\n", + "\n", + " intervalPatt = re.compile(\"^\" + bf + interval + \"$\");\n", + " pattHash['intervalPatt'] = intervalPatt\n", + "\n", + " # simple single table (table 1, t1, tab. 1a)\n", + " # returned value is second group\n", + " tablePatt = re.compile(\"^t(ab\\.|ab|able){0,1}\\s*([\\di]+[a-z]{0,1})[\\,\\;\\.]{0,1}$\");\n", + " pattHash['tablePatt'] = tablePatt\n", + "\n", + " # simple single table (table 1, t1, tab. 1a)\n", + " # returned value is third group\n", + " suppTablePatt = re.compile(\"^s(upp|upp.|lementary){0,1}\\s*t(ab\\.|ab|able){0,1}\\s*([i\\d]+[a-z]{0,1})[\\,\\;\\.]{0,1}$\");\n", + " pattHash['suppTablePatt'] = suppTablePatt\n", + " \n", + " return pattHash\n", + "\n", + "def run_simple_matcher(fig_text, patt_hash, patt_code, groups=[1]):\n", + " match = re.search(patt_hash.get(patt_code), fig_text)\n", + " results = []\n", + " if( match ) :\n", + " for g in groups:\n", + " results.append(match.group(g))\n", + " return results\n", + " else:\n", + " return None\n", + "\n", + "def build_matched_string(matched_list,code):\n", + " matched_str = \"\"\n", + " for mf in matched_list:\n", + " if len(matched_str) > 0 :\n", + " matched_str += '|'\n", + " matched_str += code + mf.replace(\" \", \"\").replace(\".\", \"\")\n", + " return matched_str\n", + "\n", + "def run_matcher(fig_text, patt_hash):\n", + " \n", + " if(fig_text == 'nfa' ):\n", + " return None\n", + " \n", + " # strip out all parentheses.\n", + " paren_patt = re.compile(\"(\\(.+?\\))\")\n", + " fig_text = re.sub(paren_patt, \"\", fig_text)\n", + "\n", + " # convert & to 'and'.\n", + " fig_text = fig_text.replace(\"&\", \"and\")\n", + " \n", + " fig_patt = patt_hash.get('figPatt')\n", + " for p in fig_patt:\n", + " match = re.search(p, fig_text)\n", + " if match:\n", + " return 'f' + match.group(2).replace(\" \",\"\").replace(\".\",\"\").replace(\",\",\"\")\n", + " \n", + " 
# [1] simplePatt\n", + " # [2,4] space2Patt\n", + " # [2,4,6] space3Patt\n", + " # [2,4] fullComma2Patt\n", + " # [2,3] comma2Patt\n", + " # [1,2] simpleComma2Patt\n", + " # [2,3,4] comma3Patt \n", + " # [1,2,3] simpleComma3Patt\n", + " # [2,3,4,5] comma4Patt\n", + " # [1,2,3,4] simpleComma4Patt\n", + " # [1,2] simpleAnd2Patt\n", + " # [1,2,3] simple_a_comma_b_patt \n", + " # [2,3,4] a_comma_b_patt \n", + " # [2,3,4,5] a_b_and_c_patt \n", + " # [1,2,3,4] simple_a_b_and_c_patt\n", + " \n", + " matched_figs = run_simple_matcher(fig_text, patt_hash, 'simplePatt', [1])\n", + " if( matched_figs is None ):\n", + " matched_figs = run_simple_matcher(fig_text, patt_hash, 'tableFigPatt', [3])\n", + " if( matched_figs is None ):\n", + " matched_figs = run_simple_matcher(fig_text, patt_hash, 'comma2Patt', [2,3])\n", + " if( matched_figs is None ):\n", + " matched_figs = run_simple_matcher(fig_text, patt_hash, 'fullComma2Patt', [2,4])\n", + " if( matched_figs is None ):\n", + " matched_figs = run_simple_matcher(fig_text, patt_hash, 'simpleComma2Patt', [1,2])\n", + " if( matched_figs is None ):\n", + " matched_figs = run_simple_matcher(fig_text, patt_hash, 'comma3Patt', [2,3,4])\n", + " if( matched_figs is None ):\n", + " matched_figs = run_simple_matcher(fig_text, patt_hash, 'simpleComma3Patt', [1,2,3])\n", + " if( matched_figs is None ):\n", + " matched_figs = run_simple_matcher(fig_text, patt_hash, 'comma4Patt', [2,3,4,5])\n", + " if( matched_figs is None ):\n", + " matched_figs = run_simple_matcher(fig_text, patt_hash, 'simpleComma4Patt', [1,2,3,4])\n", + " if( matched_figs is None ):\n", + " matched_figs = run_simple_matcher(fig_text, patt_hash, 'comma5Patt', [2,3,4,5,6])\n", + " if( matched_figs is None ):\n", + " matched_figs = run_simple_matcher(fig_text, patt_hash, 'space2Patt', [2,4])\n", + " if( matched_figs is None ):\n", + " matched_figs = run_simple_matcher(fig_text, patt_hash, 'space3Patt', [2,4,6])\n", + " if( matched_figs is None ):\n", + " matched_figs = 
run_simple_matcher(fig_text, patt_hash, 'simpleAnd2Patt', [1,2])\n", + " if( matched_figs is None ):\n", + " matched_figs = run_simple_matcher(fig_text, patt_hash, 'and2Patt', [2,3])\n", + " if( matched_figs is None ):\n", + " match = re.search(patt_hash.get('simple_a_comma_b_patt'), fig_text)\n", + " if( match ):\n", + " f = match.group(1)\n", + " a = match.group(2)\n", + " b = match.group(3)\n", + " return 'f'+f+a+'|'+'f'+f+b\n", + " if( matched_figs is None ):\n", + " match = re.search(patt_hash.get('a_comma_b_patt'), fig_text)\n", + " if( match ):\n", + " f = match.group(2)\n", + " a = match.group(3)\n", + " b = match.group(4)\n", + " return 'f'+f+a+'|'+'f'+f+b\n", + " if( matched_figs is None ):\n", + " match = re.search(patt_hash.get('simple_a_and_b_patt'), fig_text)\n", + " if( match ):\n", + " f = match.group(1)\n", + " a = match.group(2)\n", + " b = match.group(3)\n", + " return 'f'+f+a+'|'+'f'+f+b\n", + " if( matched_figs is None ):\n", + " match = re.search(patt_hash.get('a_and_b_patt'), fig_text)\n", + " if( match ):\n", + " f = match.group(2)\n", + " a = match.group(3)\n", + " b = match.group(4)\n", + " return 'f'+f+a+'|'+'f'+f+b\n", + " if( matched_figs is None ):\n", + " match = re.search(patt_hash.get('a_b_and_c_patt'), fig_text)\n", + " if( match ):\n", + " f = match.group(2)\n", + " a = match.group(3)\n", + " b = match.group(4)\n", + " c = match.group(5)\n", + " return 'f'+f+a+'|'+'f'+f+b+'|'+'f'+f+c\n", + " if( matched_figs is None ):\n", + " match = re.search(patt_hash.get('simple_a_b_and_c_patt'), fig_text)\n", + " if( match ):\n", + " f = match.group(1)\n", + " a = match.group(2)\n", + " b = match.group(3)\n", + " c = match.group(4)\n", + " return 'f'+f+a+'|'+'f'+f+b+'|'+'f'+f+c\n", + " if( matched_figs is None ):\n", + " match = re.search(patt_hash.get('simple_a_comma_b_comma_c_patt'), fig_text)\n", + " if( match ):\n", + " f = match.group(1)\n", + " a = match.group(2)\n", + " b = match.group(3)\n", + " c = match.group(4)\n", + " return 
'f'+f+a+'|'+'f'+f+b+'|'+'f'+f+c\n", + " if( matched_figs is None ):\n", + " match = re.search(patt_hash.get('a_comma_b_comma_c_patt'), fig_text)\n", + " if( match ):\n", + " f = match.group(2)\n", + " a = match.group(3)\n", + " b = match.group(4)\n", + " c = match.group(5)\n", + " return 'f'+f+a+'|'+'f'+f+b+'|'+'f'+f+c\n", + " if( matched_figs is None ):\n", + " match = re.search(patt_hash.get('intervalPatt'), fig_text)\n", + " if( match ):\n", + " fig_number = match.group(2)\n", + " start = match.group(3)\n", + " end = match.group(4)\n", + " if( len(start) > 1 or len(end)>1 ):\n", + " return None\n", + " matched_str = \"\"\n", + " subfigs = [chr(i) for i in range(ord(start),ord(end)+1)] \n", + " for subfig in subfigs :\n", + " if len(matched_str) > 0 :\n", + " matched_str += '|'\n", + " matched_str += 'f' + fig_number + subfig\n", + " return matched_str\n", + " \n", + " if(matched_figs is not None):\n", + " return build_matched_string(matched_figs, 'f')\n", + " \n", + " matched_tab = run_simple_matcher(fig_text, patt_hash, 'tablePatt', [2])\n", + " if(matched_tab is not None):\n", + " return build_matched_string(matched_tab, 't')\n", + "\n", + " matched_tab = run_simple_matcher(fig_text, patt_hash, 'suppTablePatt', [3])\n", + " if(matched_tab is not None):\n", + " return build_matched_string(matched_tab, 'st')\n", + " \n", + " return None\n", + "\n", + "def extract_simple_intact_data(input, title, tsv_output):\n", + " \n", + " with open(input, 'r') as input_file:\n", + " xml = input_file.read()\n", + " \n", + " # Check if the figure legends are specified\n", + " if \"\\\"figure legend\\\"\" not in xml: \n", + " return \n", + " \n", + " soup = BeautifulSoup(xml, 'lxml') \n", + "\n", + " intact_headings = ['pmid','i_id','orig_fig','fig','type','type_xref','p1_name',\n", + " 'p1_xref','p1_site','p2_name','p2_xref','p2_site','p3_name',\n", + " 'p3_xref','p3_site','i_meth','p_meth']\n", + " intact_rows = []\n", + "\n", + " patt_hash = 
build_figure_extraction_patterns()\n", + "\n", + " # EXPERIMENTS\n", + " all_expt_dict = {}\n", + " for e in soup.select('experimentlist experimentdescription'):\n", + " ex_dict = {}\n", + " ex_dict['i_meth'] = e.interactiondetectionmethod.names.shortlabel.text\n", + " ex_dict['p_meth'] = e.participantidentificationmethod.names.shortlabel.text \n", + " all_expt_dict[e.get('id')] = ex_dict\n", + "\n", + " # INTERACTORS\n", + " all_int_dict = {}\n", + " for i1 in soup.select('interactorlist interactor'):\n", + " int_dict = {}\n", + " int_dict['name'] = i1.names.shortlabel.text\n", + " urls = []\n", + " for t in i1.select('primaryref[db=\"uniprotkb\"]'):\n", + " if( t.get('reftype') == 'identity' ) :\n", + " urls.append(t.get('id'))\n", + " for t in i1.select('secondaryref[db=\"uniprotkb\"]'):\n", + " if( t.get('reftype') == 'identity' ) :\n", + " urls.append(t.get('id'))\n", + " int_dict['xref'] = urls\n", + " all_int_dict[i1.get('id')] = int_dict\n", + "\n", + " # INTERACTIONS\n", + " for i in soup.select('interactionlist interaction'):\n", + " int_dict = {}\n", + " int_dict['pmid'] = title\n", + " int_dict['i_id'] = i.get('id')\n", + " int_dict['type'] = i.interactiontype.names.shortlabel.text \n", + " int_dict['type_xref'] = i.interactiontype.xref.primaryref.get('id')\n", + " p_count = 1\n", + " for p_tag in i.select('participantlist participant'):\n", + " p_id = p_tag.interactorref.text\n", + " p = all_int_dict[p_id]\n", + " int_dict['p'+str(p_count)+\"_name\"] = p.get('name')\n", + " int_dict['p'+str(p_count)+\"_xref\"] = '|'.join(p.get('xref'))\n", + " p_count += 1\n", + " int_dict['fig'] = '-'\n", + " for a in i.select('attributelist attribute[name]'):\n", + " if( a.get('name') == \"figure legend\" ):\n", + " fig_text = a.text.lower()\n", + " fig_text = run_matcher(fig_text, patt_hash)\n", + " if( fig_text is None):\n", + " print(a.text.lower() + \" : None\")\n", + " int_dict['orig_fig'] = a.text\n", + " int_dict['fig'] = fig_text\n", + " e_id = 
i.experimentlist.experimentref.text\n", + " e = all_expt_dict.get(e_id)\n", + " if( e is not None ):\n", + " int_dict['i_meth'] = e.get('i_meth', '-')\n", + " int_dict['p_meth'] = e.get('p_meth', '-')\n", + " else: \n", + " int_dict['i_meth'] = '-'\n", + " int_dict['p_meth'] = '-'\n", + " \n", + " r = []\n", + " for h in intact_headings:\n", + " r.append(int_dict.get(h,'-'))\n", + " intact_rows.append(r)\n", + " \n", + " intact_df = pd.DataFrame.from_records(intact_rows, columns=intact_headings) \n", + " intact_df.to_csv(tsv_output, sep='\\t', encoding='utf-8')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Execution of code to simplify INTACT records from standard XML into TSV format. " + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/simple_intact_files/\n" + ] + } + ], + "source": [ + "stem = '/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/'\n", + "intact_dir = stem + 'gold_standard/'\n", + "simple_intact_dir = stem + 'simple_intact_files/'\n", + "\n", + "print(simple_intact_dir)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "for x in os.walk(intact_dir):\n", + " for infile in glob(os.path.join(x[0], '*.xml')):\n", + " fn = ntpath.basename(infile)\n", + " if( os.path.isfile(infile) and fn.endswith('.xml') ):\n", + " title = fn.replace(\".xml\", \"\")\n", + " if( title not in pmids ):\n", + " continue\n", + "\n", + " outfile = simple_intact_dir + \"/\" + title + \".tsv\"\n", + " if( not os.path.isfile(outfile) ):\n", + " try:\n", + " extract_simple_intact_data(infile, title, outfile)\n", + " except KeyError:\n", + " print(\"KeyError for \" + infile)" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "Run this script to convert collections of PSI-MI2.5 files to biopax. We've updated the script to run our updated PaxTools from github.com/BMKEG/Paxtools which includes annotations about Figures in Biopax evidence codes. " + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "paxtools_jar = \"/Users/Gully/Coding/git/biopax/Paxtools/paxtools-console/target/paxtools.jar\"\n", + "\n", + "data_dir = \"/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/gold_standard_data\"\n", + "open_access_dir = \"/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/oa_gold_data\"\n", + "biopax_dir = \"/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/biopax\"\n", + "new_biopax_dir = \"/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/biopax_reformat\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# THIS RUNS THE UPDATED PAXTOOLS TO GENERATE BIOPAX 3 DATA FOR OUR USE.\n", + "for root, dirs, files in os.walk(data_dir):\n", + " for file in files: \n", + " if os.path.isfile(root+'/'+file) and file[-4:]=='.xml' :\n", + " pmid = file[:-4]\n", + "\n", + " if( pmid in pmids ): \n", + " cmds = [\"java\",\"-jar\",paxtools_jar,\"toLevel3\",root+'/'+file,biopax_dir+'/'+pmid+'_biopax.xml','-psimiToComplexes']\n", + " print \" \".join(cmds)\n", + " call(cmds)\n", + " print \"\\tDONE\"" + ] + }, + { + "cell_type": "code", + "execution_count": 51, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def reformat_figure_legend_annotations(input):\n", + " \n", + " with open(input, 'r') as input_file:\n", + " xml = input_file.read()\n", + " \n", + " if \">Figure:\" not in xml: \n", + " return \n", + "\n", + " 
patt_hash = build_figure_extraction_patterns()\n", + " fig_patt = re.compile(\">Figure:(.*?)<\")\n", + "\n", + " output = \"\"\n", + " with open(input, 'r') as input_file:\n", + " for line in input_file.readlines(): \n", + " match = re.search(fig_patt, line)\n", + " if match: \n", + " fig_text = match.group(1).lower()\n", + " new_fig_text = run_matcher(fig_text, patt_hash)\n", + " if( new_fig_text is not None ):\n", + " line = re.sub(fig_patt,\">Figure:\"+new_fig_text+\"<\", line) # print fig_text + ' => ' + new_fig_text\n", + "\n", + " output += line\n", + "\n", + " return output\n", + "\n", + "# THIS FORMATS FIGURE ANNOTATIONS IN THE UPDATED BIOPAX 3 FILES.\n", + "for root, dirs, files in os.walk(biopax_dir):\n", + " for file in files: \n", + " if os.path.isfile(root+'/'+file) and file[-4:]=='.xml' :\n", + " # Now, load each BIOPAX 3 file, and run the patterns on text found in the XML\n", + " reformatted_text = reformat_figure_legend_annotations(root+'/'+file)\n", + "\n", + " if reformatted_text is not None:\n", + " with open(new_biopax_dir+'/'+file, 'w') as output_file:\n", + " output_file.write(reformatted_text)\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Code to find which pmids have intact records " + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from shutil import copyfile\n", + "\n", + "def copy_figure_files(intactFile, figAssigmentDir, outDir):\n", + "\n", + " frames = []\n", + " \n", + " intact_tsv = pd.read_csv(intactFile, sep='\\t')\n", + " \n", + " fries_sentences = []\n", + " fries_hits = []\n", + " fries_events = []\n", + " count = 0\n", + " fries_count = 0\n", + " hit_count = 0\n", + " miss_count = 0\n", + " for i,row in intact_tsv.iterrows():\n", + " pmid = str(row['pmid'])\n", + " fig = str(row['fig'])\n", + " src_file = figAssigmentDir+'/'+pmid+'_'+fig+'.txt'\n", + " dst_file = outDir+'/'+pmid+'_'+fig+'.txt'\n", + " if( os.path.isfile(figAssigmentDir + 
'/'+pmid+'_'+fig+'.txt') ) :\n", + " copyfile(src_file, dst_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "fig_sentences_dir = stem + 'fig_sentences'\n", + "out_sentences_dir = stem + 'fig_sentences_in_intact'\n", + "\n", + "for root, dirs, files in os.walk(simple_intact_dir):\n", + " for file in files: \n", + " if os.path.isfile(root+'/'+file) and file[-4:]=='.tsv' :\n", + " copy_figure_files(root+'/'+file, fig_sentences_dir, out_sentences_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Code to link the intact files to the sciDt data.\n", + "\n", + "This is derived from the simplified TSV-format generated above. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def link_scidt_to_intact(intactFile, scidtDir, outFile):\n", + "\n", + " frames = []\n", + " \n", + " intact_tsv = pd.read_csv(intactFile, sep='\\t')\n", + " \n", + " fries_sentences = []\n", + " fries_hits = []\n", + " fries_events = []\n", + " count = 0\n", + " fries_count = 0\n", + " hit_count = 0\n", + " miss_count = 0\n", + " for i,row in intact_tsv.iterrows():\n", + " pmid = row['pmid']\n", + " print(pmid)\n", + " intact_fig = row['fig']\n", + " p1 = row['p1_xref']\n", + " p2 = row['p2_xref']\n", + " p3 = row['p3_xref']\n", + "\n", + " fries_events_local = []\n", + " \n", + " # find the figure numbers in the paper designation \n", + " scidt_path = os.path.join(scidtDir, str(pmid) + \".tsv\")\n", + " if( os.path.isfile( scidt_path ) ):\n", + " scidt_tsv = pd.read_csv(scidt_path, sep='\\t')\n", + " for i2,row2 in scidt_tsv.iterrows():\n", + " fries_sentence = row2['friesSentenceId'] \n", + " fries_event = row2['friesEventsTypes'] \n", + " scidt_figs = row2['Figure Assignment']\n", + " if( scidt_figs == scidt_figs and fries_event == fries_event):\n", + " for scidt_fig in 
scidt_figs.split('|'):\n", + " if scidt_fig == intact_fig and 'complex-assembly' in fries_event:\n", + " fries_count += 1\n", + " if( p1 != p1 or p2 != p2 or p3 != p3):\n", + " hit = \"MISS\"\n", + " miss_count += 1\n", + " elif( (p1 == '-' or p1 in fries_event) and \n", + " (p2 == '-' or p2 in fries_event) and \n", + " (p3 == '-' or p3 in fries_event) ):\n", + " hit = \"HIT\"\n", + " hit_count += 1\n", + " else :\n", + " hit = \"MISS\"\n", + " miss_count += 1\n", + " fries_events_local.append(fries_event + '[' + hit + ']')\n", + " \n", + " fries_events.append(fries_events_local)\n", + " \n", + " intact_tsv['fries_events'] = pd.Series(fries_events)\n", + " \n", + " intact_tsv.to_csv(outFile, sep='\\t')\n", + " print (\"COUNT: %d\" % fries_count)\n", + " print (\"HITS: %d\" % hit_count)\n", + " print (\"MISSES: %d\" % miss_count )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "Run through Biopax entries. Load each file and search for evidence. Link that evidence to sentences via figure legends." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "tsv_dir = \"/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/scidt_fries_bioc_tsv4\"\n", + "new_biopax_dir = \"/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/biopax_reformat\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1\n", + "9\n", + "1\n", + "2\n", + "1\n", + "10\n", + "5\n", + "4\n", + "5\n", + "1\n", + "9\n", + "5\n", + "18\n", + "1\n", + "6\n", + "13\n", + "9\n", + "21\n", + "215\n", + "4\n", + "5\n", + "6\n", + "5\n", + "6\n", + "2\n", + "2\n", + "3\n", + "2\n", + "9\n", + "6\n", + "5\n", + "9\n", + "4\n", + "1\n", + "3\n", + "4\n", + "1\n", + "2\n", + "15\n", + "8\n", + "14\n", + "13\n", + "6\n", + "3\n", + "1\n", + "4\n", + "4\n", + "29\n", + "11\n", + "24\n", + "14\n", + "5\n", + "2\n", + "11\n", + "7\n", + "14\n", + "2\n", + "6\n", + "5\n", + "3\n", + "2\n", + "6\n", + "5\n", + "1\n", + "5\n", + "3\n", + "1\n", + "12\n", + "5\n", + "6\n", + "1\n", + "7\n", + "13\n", + "2\n", + "6\n", + "2\n", + "6\n", + "8\n", + "4\n", + "1" + ] + } + ], + "source": [ + "import uuid\n", + "import pandas as pd\n", + "import json\n", + "\n", + "def generate_annotation_page(pmid, biopax_path, scidt_path):\n", + " annotation_items = []\n", + " annotation_page = {\n", + " \"@context\": \"http://www.w3.org/ns/anno.jsonld\",\n", + " \"id\": \"http://sciknowengine.isi.edu/iswc17/annotation_page/\"+pmid,\n", + " \"type\": \"AnnotationPage\",\n", + " \"partOf\": {\n", + " \"id\": \"http://sciknowengine.isi.edu/iswc17/annotations\"\n", + " },\n", + " \"next\": \"http://example.org/page2\",\n", + " \"startIndex\": 0,\n", + " \"items\": annotation_items\n", + " }\n", + " \n", + " biopax_lines = []\n", + " with 
open(biopax_path, 'r') as biopax_file:\n", + " biopax_lines = biopax_file.readlines()\n", + "\n", + " scidt_tsv = pd.read_csv(scidt_path, sep='\\t')\n", + " \n", + " we_are_on = False\n", + " evidence_patt = re.compile(\"\")\n", + " figure_patt = re.compile(\">Figure:(.*?)<\")\n", + " evidence_off_patt = re.compile(\"<\\/bp:Evidence>\")\n", + "\n", + " evidence_code = ''\n", + " figure_code = ''\n", + " for biopax_line in biopax_lines: \n", + " evidence_match = re.search(evidence_patt, biopax_line)\n", + " if evidence_match: \n", + " evidence_code = evidence_match.group(1)\n", + " figure_code = ''\n", + " we_are_on = True\n", + "\n", + " figure_match = re.search(figure_patt, biopax_line)\n", + " if figure_match: \n", + " figure_code = figure_match.group(1)\n", + " \n", + " if we_are_on and len(figure_code)>0:\n", + " \n", + " targets = []\n", + " annotation = {\n", + " \"id\": \"http://sciknowengine.isi.edu/iswc17/annotations/\"+pmid+'#'+str(count),\n", + " \"type\": \"Annotation\",\n", + " \"body\": {\n", + " \"id\": evidence_code,\n", + " \"type\": \"Dataset\"\n", + " },\n", + " \"target\": targets\n", + " }\n", + " annotation_items.append(annotation)\n", + " \n", + " for i, row in scidt_tsv.iterrows():\n", + " sid = row['SentenceId']\n", + " text = row['Sentence Text']\n", + " codeStr = row['Codes']\n", + " expts = row['ExperimentValues']\n", + " paragraph = row['Paragraph']\n", + " heading = row['Headings']\n", + " discourse = row['Discourse Type']\n", + " offset_start = row['Offset_Begin']\n", + " offset_end = row['Offset_End']\n", + " fig = row['Figure Assignment']\n", + " \n", + " if(fig != fig):\n", + " continue\n", + " \n", + " for f in re.split(\"|\", fig):\n", + " if( f in figure_code):\n", + " \n", + " targets.append({\n", + " \"source\": \"https://www.ncbi.nlm.nih.gov/pmc/articles/pmid/\" + str(pmid),\n", + " \"selector\": [{\n", + " \"type\": \"TextQuoteSelector\",\n", + " \"exact\": text\n", + " },\n", + " {\n", + " \"type\": 
\"TextPositionSelector\",\n", + " \"start\": offset_start,\n", + " \"end\": offset_end\n", + " }]\n", + " })\n", + " \n", + " annotation['target'] = targets\n", + " we_are_on = False\n", + " \n", + " #print len(annotation_items)\n", + " annotation_page['items'] = annotation_items\n", + " \n", + " return annotation_page\n", + "\n", + "annotation_collection_path = \"/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/annotation_collection.json\"\n", + "annotation_pages_path = \"/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/pages\"\n", + "\n", + "page = {}\n", + "annotation_collection = {\n", + " \"@context\": \"http://www.w3.org/ns/anno.jsonld\",\n", + " \"id\": \"http://sciknowengine.isi.edu/iswc17/annotations\",\n", + " \"type\": \"AnnotationCollection\",\n", + " \"label\": \"Anntoations linking BioPax records from the INTACT database to text fragments describing evidence\",\n", + " \"total\": 0,\n", + " \"first\": page\n", + "}\n", + "\n", + "count = 0\n", + "annotation_pages = {}\n", + "last_annotation_page = None\n", + "for root, dirs, files in os.walk(new_biopax_dir):\n", + " for file in files: \n", + " if os.path.isfile(root+'/'+file) and file[-4:]=='.xml' :\n", + " l = len('_biopax.xml')\n", + " pmid = file[:-l]\n", + " tsv_file = tsv_dir+'/'+str(pmid)+'.tsv'\n", + " \n", + " if not os.path.isfile(tsv_file):\n", + " continue\n", + " \n", + " annotation_page = generate_annotation_page(pmid, root+'/'+file, tsv_dir+'/'+str(pmid)+'.tsv')\n", + " count += 1\n", + " #print json.dumps(annotation_page, sort_keys=True, indent=4, separators=(',', ': '))\n", + " \n", + " if(last_annotation_page is None):\n", + " annotation_collection['first'] = annotation_page['id']\n", + " else:\n", + " last_annotation_page['next'] = annotation_page['id']\n", + " \n", + " annotation_page_dump = json.dumps(annotation_page, sort_keys=True, indent=4, separators=(',', ': '))\n", + " with 
open(annotation_pages_path+'/page_'+pmid+'.json', 'w') as annotation_page_file:\n", + " annotation_page_file.write(annotation_page_dump)\n", + " \n", + " last_annotation_page = annotation_page\n", + " \n", + "annotation_collection['total'] = count\n", + "annotation_collection_dump = json.dumps(annotation_collection, sort_keys=True, indent=4, separators=(',', ': '))\n", + "with open(annotation_collection_path, 'w') as annotation_collection_file:\n", + " annotation_collection_file.write(annotation_collection_dump)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "annotation_collection_path = \"/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/annotation_collection.json\"\n", + "annotation_pages_path = \"/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/pages\"\n", + "\n", + "annotation_collection_dump = json.dumps(annotation_collection, sort_keys=True, indent=4, separators=(',', ': '))\n", + "with open(annotation_collection_path, 'w') as annotation_collection_file:\n", + " annotation_collection_file.write(annotation_collection_dump)\n", + "\n", + "for pmid in annotation_pages.keys():\n", + " page = annotation_pages[pmid]\n", + " annotation_page_dump = json.dumps(page, sort_keys=True, indent=4, separators=(',', ': '))\n", + " with open(annotation_pages_path+'/page_'+pmid+'.json', 'w') as annotation_page_file:\n", + " annotation_page_file.write(annotation_page_dump)\n", + " \n", + "print json.dumps(annotation_collection, sort_keys=True, indent=4, separators=(',', ': '))\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + 
"version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.13" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/.ipynb_checkpoints/word-movers-distance-in-python-checkpoint.ipynb b/notebooks/.ipynb_checkpoints/word-movers-distance-in-python-checkpoint.ipynb new file mode 100644 index 0000000..50a2b06 --- /dev/null +++ b/notebooks/.ipynb_checkpoints/word-movers-distance-in-python-checkpoint.ipynb @@ -0,0 +1,1358 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Word mover's distance classification in Python\n", + "\n", + "*A guide to scikit-learn compatible nearest neighbors classification using the recently introduced word mover's distance (WMD). *\n", + "Joint post with the awesome [Matt Kusner](http://matthewkusner.com)!\n", + "\n", + "[Source of this Jupyter notebook.](http://nbviewer.jupyter.org/github/vene/vene.github.io/blob/pelican/content/blog/word-movers-distance-in-python.ipynb)\n", + "\n", + "In document classification and other natural language processing applications, having a good measure of the similarity of two texts can be a valuable building block. Ideally, such a measure would capture semantic information. Cosine similarity on bag-of-words vectors is known to do well in practice, but it inherently cannot capture when documents say the same thing in completely different words.\n", + "\n", + "Take, for example, two headlines:\n", + "\n", + " * *Obama speaks to the media in Illinois*\n", + " * *The President greets the press in Chicago*\n", + "\n", + "These have no content words in common, so according to most bag of words--based metrics, their distance would be maximal. 
(For such applications, you probably don't want to count stopwords such as *the* and *in*, which don't truly signal semantic similarity.)\n", + "\n", + "One way out of this conundrum is the word mover's distance (WMD), introduced in \n", + "[*From Word Embeddings To Document Distances*](http://mkusner.github.io/publications/WMD.pdf),\n", + "(Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. Weinberger, ICML 2015).\n", + "WMD adapts the [earth mover's distance](https://en.wikipedia.org/wiki/Earth_mover%27s_distance) to the space of documents: the distance between two texts is given by the total amount of \"mass\" needed to move the words from one side into the other, multiplied by the distance the words need to move. So, starting from a measure of the distance between different words, we can get a principled document-level distance. Here is a visualisation of the idea, from the ICML slides:\n", + "\n", + "![WMD example from Matt's slides](https://vene.ro/images/wmd-obama.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare some word embeddings\n", + "\n", + "The key ingredient in WMD is a good distance measure between words. Dense representations of words, also known by the trendier name \"word embeddings\" (because \"distributed word representations\" didn't stick), do the trick here. We could train the embeddings ourselves, but for meaningful results we would need tons of documents, and that might take a while. So let's just use the ones from the [`word2vec`](https://code.google.com/p/word2vec/) team. 
[(download link)](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "import numpy as np\n", + "import gensim\n", + "\n", + "from sklearn.datasets import fetch_20newsgroups\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from sklearn.cross_validation import train_test_split\n" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "collapsed": false, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "s6 (,hypothesis|implication) [X] : In support of distinct peroxisomal binding sites for Pex7p , the Pex7p/Pex13p and Pex7p/ Pex14p complexes can form independently. Genetic evidence for the interaction of Pex7p and Pex13p is provided by the observation that overexpression of Pex13p suppresses a loss of function mutant of Pex7p .\n", + "s8 (,implication|implication) [0] : NH2-terminal regions of Pex13p are required for its interaction with the PTS2-receptor while the COOH-terminal SH3 domain alone is sufficient to mediate its interaction with the PTS1-receptor .\n", + "s9 (,result|implication) [0] : Reinvestigation of the topology revealed both termini of Pex13p to be oriented towards the cytosol .\n", + "s10 (,result|implication) [0] : We also found Pex13p to be required for peroxisomal association of Pex14p , yet the SH3 domain of Pex13p may not provide the only binding site for Pex14p at the peroxisomal membrane .\n", + "s18 (,result|result) [0] : pex7Delta cells exhibit the reverse phenotype ( for review see exLink ) .\n", + "s19 (,implication) [0] : The intracellular localization of both targeting signal receptors is still a matter of debate .\n", + "s20 (,result|result) [0] : A predominantly cytosolic , membrane-bound , and even intraperoxisomal localization have been 
reported for both receptors ( for review see exLink ) .\n", + "s23 (,result|implication) [0] : There is no experimental evidence for this model , but it is consistent with the observation that peroxisomes are able to import both folded and oligomeric proteins ( for review see exLink ) .\n", + "s30 (,implication|implication) [0] : Together , these data suggest that the two import pathways are not independent but overlapping , with Pex14p as the point of convergence of the pathways at the peroxisomal membrane ( exLink ) .\n", + "s33 (,result|method) [X] : In addition , overexpression of Pex13p suppresses the protein import defect caused by HA-tagged , functionally compromised Pex7p , further suggesting an interaction between the two proteins by genetic means .\n", + "s35 (,result|result) [0] : Reinvestigation of the membrane topology of Pex13p revealed that both termini of the protein are exposed to the cytosol .\n", + "s36 (,implication) [0] : Pex13p was also required for Pex14p localization at the peroxisomal membrane .\n", + "s37 (,result) [0] : However , the peroxisomal targeting of Pex14p did not require interaction with the SH3 domain of Pex13p .\n", + "s133 (Results,result|hypothesis) [0] : It has been reported that the import receptors Pex5p and Pex7p interact with each other in the yeast two-hybrid system , which opened the possibility that both proteins may form a heteromeric cytosolic signal recognition complex ( exLink ) .\n", + "s134 (Results,result|hypothesis) [0] : However , the yeast two-hybrid system does not necessarily distinguish between a direct and indirect binding of two S. 
cerevisiae proteins , as endogenous proteins may contribute to the observed interaction .\n", + "s135 (Results,hypothesis|result) [X] : As Pex14p can bind both import receptors , we investigated whether the Pex5p/Pex7p interaction is still observed in a yeast two-hybrid reporter strain deleted for the genomic copy of PEX14 ( exLink ) .\n", + "s143 (Results,method|result) [0] : Pex14p and Pex13p , but not Fbp1p , pelleted , indicating the complete sedimentation of cytosol-free peroxisomal membranes ( Fig. 2 ) .\n", + "s144 (Results,result|result) [0] : As reported previously ( exLink ) , mycPex7p was predominantly found in the soluble fraction in wild-type cells , while a low but significant amount was detected in the membrane fraction .\n", + "s145 (Results,result|result) [0] : A decrease of mycPex7p in the pellet fraction of pex14Delta cells ( Fig. 2 ) suggests that the majority of sedimentable Pex7p associates with membranes in a Pex14p-dependent manner .\n", + "s146 (Results,result|implication) [X] : However , in pex14Delta cells a significant amount of mycPex7p was detected in the membrane pellet fraction ( Fig. 2 ) , indicating that next to Pex14p additional binding factors for Pex7p exist at the peroxisomal membrane .\n", + "s147 (Results,result) [0] : Coimmunoprecipitation of Pex13p and Pex7p in the Absence of Pex14p and Pex5p\n", + "s149 (Results,result|result) [0] : As reported previously , we found Pex5p , Pex13p , Pex14p , and Pex17p associated with mycPex7p when the receptor was precipitated from wild-type or complemented pex7Delta cells ( exLink ; exLink ) .\n", + "s150 (Results,result) [0] : Comparison of the constituents of the precipitates revealed five interesting observations .\n", + "s151 (Results,result|implication) [0] : First , in pex14Delta and pex5Delta/pex14Delta strains , Pex13p still coimmunoprecipitated with mycPex7p ( Fig. 
3 ) , suggesting that Pex13p associates directly or indirectly with Pex7p .\n", + "s152 (Results,result|implication) [0] : Moreover , this result indicated that neither Pex14p nor Pex5p is required for the formation of this subcomplex of Pex13p and Pex7p .\n", + "s153 (Results,result|result) [X] : Second , the amount of Pex5p in the precipitate from pex14Delta cells was drastically reduced , while the amount in Pex13p remained essentially unchanged ( Fig. 3 , lane pex14Delta ) .\n", + "s154 (Results,implication|implication) [X] : This result supports the notion that the amount of Pex5p bound to Pex13p does not determine the stoichiometry of the Pex13p-Pex7p subcomplex .\n", + "s155 (Results,implication|implication) [0] : However , it also suggests that Pex13p may not bind both import receptors equally at the same time .\n", + "s156 (Results,result) [0] : Third , Pex13p , Pex14p , and Pex5p still coimmunoprecipitated with Pex7p in pex17Delta cells ( Fig. 3 , lane pex17Delta ) .\n", + "s157 (Results,fact|implication) [0] : Obviously , Pex7p is associated with components of the peroxisomal translocation machinery in the absence of Pex17p , suggesting that the presence of Pex17p is not a prerequisite for docking of Pex7p to the peroxisomal membrane .\n", + "s158 (Results,result|implication) [0] : Fourth , the lack of Pex17p in the coimmunoprecipitate from pex14Delta cells ( Fig. 
3 , lane pex14Delta ) , suggests that Pex14p is required for the association of Pex17p with the complex , and is consistent with the assumption that Pex17p binding to the complex may be via Pex14p .\n", + "s159 (Results,implication|result) [0] : However , this observation must be interpreted with care since the pex14Delta cells contain much less immunologically detectable Pex17p ( exLink ) .\n", + "s160 (Results,result|result) [X] : Finally , the amount of Fox3p that coimmunoprecipitates with Pex7p drastically increases in mutants with an import defect for PTS2 proteins ( pex17Delta , pex13Delta , pex14Delta , and pex5Delta/pex14Delta ) relative to the strains unaffected in this pathway ( wild-type and pex5Delta ) .\n", + "s161 (Results,result|result) [X] : Since the total amount of both proteins is similar in all strains ( Fig. 3 B ) , it seems unlikely that the observed Pex7p/Fox3p complex has formed in vitro after cell disruption .\n", + "s162 (Results,hypothesis|implication) [0] : A simple explanation for this may be that the high cytosolic concentration of thiolase in the import mutants results in greater occupation of the PTS2 receptor .\n", + "s164 (Results,result|result) [0] : These proteins were not detected in any of the samples , indicating the specificity of the observed interactions ( data not shown ) .\n", + "s166 (Results,result|goal) [0] : The observed in vivo association of Pex7p with Pex13p in cells lacking Pex14p and Pex5p encouraged us to analyze the interaction of these proteins in more detail .\n", + "s169 (Results,result|result) [0] : The results shown in Fig. 
4 A reveal that the full length Pex13p is indeed able to interact with the PTS2-receptor Pex7p .\n", + "s170 (Results,result|result) [0] : The controls included show that coexpression of either of the fusion proteins alone did not support transcription activation of the reporter genes .\n", + "s173 (Results,result|result) [X] : Because Pex13pE320K lost the ability to interact with Pex14p in the yeast two-hybrid system ( Fig. 4 B , see also Fig. 8 ) , this experiment was expected to monitor the Pex13p/Pex7p interaction upon simultaneous elimination of the Pex14p and Pex5p influence .\n", + "s174 (Results,result|result) [0] : As shown in Fig. 4 , these two-hybrid analyses did not reveal an influence of Pex5p or Pex14p on the Pex13p/ Pex7p interaction .\n", + "s175 (Results,result|result) [X] : No difference was observed independent of whether the Pex7p/Pex13p interaction was analyzed in wild-type , pex5Delta , or pex14Delta strains ( Fig. 4 A ) , or for the Pex7p/Pex13pE320K interaction in pex5Delta cells ( Fig. 4 B ) .\n", + "s176 (Results,implication|implication) [0] : These results indicate that neither Pex14p nor Pex5p is required for the in vivo interaction of Pex7p with Pex13p , and therefore are in agreement with results obtained in the coimmunoprecipitation experiment ( Fig. 3 ) .\n", + "s177 (Results,result|implication) [0] : The two-hybrid interaction of the complete Pex13p with Pex14p is only detected by histidine prototrophy ( Fig. 4 B ) , indicating that regions NH2-terminal of the SH3 domain of Pex13p may weaken the interaction of these proteins in the two-hybrid system .\n", + "s179 (Results,fact|result) [0] : Mutant cells lacking Pex7p are characterized by their inability to grow on oleic acid as the sole carbon source ( Fig. 
5 A ) and by mislocalization of peroxisomal thiolase to the cytosol ( exLink ; exLink ) .\n", + "s180 (Results,result) [X] : Expression of a COOH-terminally HA-tagged Pex7p from the low copy plasmid pRSPEX7-HA3 leads only to a partial complementation of the pex7Delta mutant phenotype ( exLink ) .\n", + "s181 (Results,implication|result) [0] : This is indicated by the inability of the transformants to grow on oleic acid plates ( Fig. 5 A ) and a reduced ability to import Fox3p ( thiolase ) into peroxisomes .\n", + "s182 (Results,result) [0] : The latter is evident by the pronounced cytosolic mislocalization of this protein ( Fig. 5 B , panel d ) .\n", + "s183 (Results,result|goal) [X] : This mutant phenotype of pex7Delta [ pRSPEX7-HA3 ] was employed to investigate whether overexpression of Pex7p-binding partners may suppress a defect in Pex7p function .\n", + "s185 (Results,result|result) [X] : As judged by their growth characteristics on oleic acid medium ( Fig. 5 A ) and by the fluorescence pattern for thiolase ( Fig. 5 B ) , overexpression of PEX13 , but not PEX14 , rescued the mutant phenotype caused by the defective Pex7p-HA .\n", + "s186 (Results,result|implication) [X] : Even though the suppression was not as efficient as complementation with the wild-type PEX7 , this observation demonstrates that Pex13p can suppress the mutant phenotype of pex7Delta [ pRSPEX7-HA3 ] , providing genetic evidence for an interaction between Pex7p and Pex13p .\n", + "s191 (Results,result|result) [0] : The tag has been shown previously not to affect the function of Pex13p ( exLink ) .\n", + "s193 (Results,result|result) [X] : As judged by immunoblot analysis , both the NH2-terminal myc-tag as well as the SH3 domain of Pex13p were rapidly degraded by the protease ( Fig. 
6 ) .\n", + "s194 (Results,result) [0] : Intraperoxisomal thiolase remained stable under these conditions and was only degraded in the presence of detergents ( data not shown ) .\n", + "s195 (Results,implication|implication) [0] : From this data , we conclude that both the NH2 terminus and the COOH-terminal SH3 domain are exposed to the cytosol .\n", + "s196 (Results,implication) [0] : This result also implicates the presence of an even number of transmembrane spans within Pex13p .\n", + "s203 (Results,method|implication) [0] : This observation suggests that Pex17p is not required for the targeting of Pex14p to the peroxisomal membrane .\n", + "s204 (Results,implication) [0] : In contrast , no congruent fluorescence patterns were observed in pex13Delta cells .\n", + "s205 (Results,result|fact) [0] : Since the HA-tagged Pex11p is known to be targeted to peroxisomal membrane ghosts in pex13Delta cells ( exLink ) , the lack of congruence suggests that the majority of Pex14p is mislocalized .\n", + "s206 (Results,implication|goal) [0] : To confirm this result by independent means , we performed a flotation of wild-type , pex13Delta , and pex17Delta homogenates in sucrose gradients ( Fig. 
7 B ) .\n", + "s209 (Results,fact|result) [0] : However , Pex14p was not detected in these fractions , but was found to cosegregate with mitochondrial fumarase .\n", + "s210 (Results,fact|implication) [0] : These data suggest that the peroxisomal membrane ghosts in pex13Delta cells lack Pex14p .\n", + "s211 (Results,implication) [0] : Thus , the presence of Pex13p is a prerequisite for peroxisomal membrane association of Pex14p .\n", + "s212 (Results,implication|hypothesis) [0] : Pex13p could be involved in targeting , or it could be required for binding or retention of Pex14p at the peroxisome .\n", + "s217 (Results,result) [0] : Remarkably , the mutated Pex14pAXXA still complemented the peroxisome biogenesis defect of pex14Delta cells ( data not shown ) .\n", + "s219 (Results,result|result) [0] : This mutation has been reported to result in the inactivation of Pex13p function ( exLink ) .\n", + "s220 (Results,result|result) [0] : As shown in Fig. 8 , the mutated Pex14pAXXA had lost the ability to bind Pex13p in the yeast two-hybrid system while binding to Pex5p , Pex7p , and oligomerization of the protein was unchanged .\n", + "s221 (Results,result) [0] : Also the E320K mutation of Pex13p abolished the two-hybrid interaction of the SH3 domain of Pex13p with Pex14p ( Fig. 8 ) .\n", + "s222 (Results,implication|implication) [0] : These results suggest that strong interactions between Pex14p and the SH3 domain of Pex13p are dependent on the PXXP motif within Pex14p , as well as on the RT loop of the SH3 domain of Pex13p .\n", + "s223 (Results,method|result) [0] : Next , we analyzed the Pex14pAXXA ( Fig. 
9 A ) association with peroxisomal membrane ghosts of pex14Delta/pex17Delta double mutants which were predicted to contain peroxisomal membrane ghosts even upon complementation of the pex14Delta mutation .\n", + "s226 (Results,result) [0] : Colocalization was observed for HA-Pex11p and Pex14pAXXA in pex14Delta cells , as well as for HA-Pex11p and Pex14p in pex13Delta cells expressing Pex13pE320K , indicative of peroxisomal membrane association of these proteins ( Fig. 9 A ) .\n", + "s227 (Results,result|result) [0] : These results were corroborated by flotation analysis which revealed that Pex14pAXXA was associated with the fraction containing the peroxisomal membrane ghosts of pex14Delta/pex17Delta , as were Pex14p in pex13Delta/pex17Delta cells expressing Pex13pE320K ( Fig. 9 B ) .\n", + "s228 (Results,implication|implication) [0] : These observations suggest that Pex14p is associated with peroxisomes and peroxisomal membrane ghosts independent of interaction between the proline-rich motif of Pex14p and the RT loop in the SH3 domain of Pex13p .\n", + "s229 (Results,result|implication) [0] : Interestingly , the fractionation of pex13Delta/ pex17Delta [ PEX13E320K ] shows that , although the RT loop of the SH3 domain of Pex13p is not absolutely required for the targeting of Pex14p to the membrane of peroxisomal ghosts , it appears to enhance or stabilize the targeting , as only Pex14p trails through the gradients of this mutant strain ( Fig. 
9 B ) .\n", + "s230 (Discussion,implication) [0] : Discussion\n", + "s231 (Discussion,implication|implication) [0] : The peroxisomal membrane protein Pex14p has been reported to bind both the PTS1 and the PTS2 receptor , which led exLink to the conclusion that Pex14p may represent the point of convergence of the PTS1 - and PTS2-dependent protein import pathways at the peroxisomal membrane .\n", + "s234 (Discussion,implication|implication) [0] : Pex13p is also shown to be required for the peroxisomal association of Pex14p ; however , evidence is provided that the SH3 domain of Pex13p may not represent the only binding site for Pex14p at the peroxisomal membrane .\n", + "s236 (Discussion,result|fact) [0] : The SH3 domain of Pex13p has been reported to interact with the PTS1 receptor Pex5p and with Pex14p ( exLink ; exLink ; exLink ; exLink ; exLink ; Fig. 8 ) .\n", + "s237 (Discussion,fact|result) [X] : A mutation in the RT loop of the SH3 domain of Pex13p , as well as a mutation of a putative class II SH3 ligand motif of Pex14p abolished the two-hybrid interaction of both proteins ( Fig. 8 ) , supporting the notion of a typical SH3 domain-ligand interaction between Pex13p and Pex14p .\n", + "s238 (Discussion,result|result) [0] : Interestingly , although the E320K mutation of the RT loop of the SH3 domain of Pex13p abolishes its two-hybrid interaction with Pex14p , the mutated SH3 domain still interacts with Pex5p ( Fig. 8 B ) .\n", + "s239 (Discussion,implication|implication) [0] : Accordingly , we conclude that there are distinct binding sites for both Pex5p and Pex14p within this domain or adjacent regions contained within the construct used for the assay .\n", + "s240 (Discussion,result) [X] : Remarkably , neither the E320K mutation of the SH3 domain of Pex13p nor the mutation of the proline-rich motif of Pex14p prevented the peroxisomal localization of Pex14p ( Fig. 
9 ) .\n", + "s241 (Discussion,implication|implication) [0] : This observation suggests that the binding of Pex14p to the SH3 domain of Pex13p is not absolutely required for the targeting and binding of Pex14p to peroxisomes .\n", + "s242 (Discussion,result) [0] : Why then does the absence of Pex13p lead to the mistargeting of Pex14p ( Fig. 7 ) ?\n", + "s247 (Discussion,hypothesis|implication) [0] : It is true that Pex17p is another binding partner of Pex14p , but our data suggest that Pex17p is not required for association of the Pex13p/ Pex14p/Pex5p/Pex7p complex , as all these components can efficiently coprecipitate in the absence of Pex17p ( Fig. 3 ) .\n", + "s248 (Discussion,result|result) [0] : Moreover , we found no Pex17p in a precipitate from pex14Delta cells that still contains Pex13p and Pex7p ( Fig. 3 ) , leading to two conclusions .\n", + "s252 (Discussion,result|implication) [X] : The amount of Pex7p in the membrane sediment of pex14Delta cells is significantly lower than in wild-type or pex13Delta cells ( Fig. 2 ) , suggesting that Pex14p may contribute to the majority of the total binding capacity of the peroxisomal membrane for the PTS2 receptor .\n", + "s253 (Discussion,result) [X] : However , a significant amount of Pex7p was sedimented in the absence of Pex14p ( Fig. 2 , lane pex14Delta ) .\n", + "s254 (Discussion,result|implication) [X] : Interestingly , in cells lacking both Pex13p and Pex14p , no Pex7p was found in the membrane pellet , which suggests that Pex13p contributed to the remaining Pex7p associated with peroxisomal membranes of pex14Delta cells ( data not shown ) .\n", + "s255 (Discussion,implication|result) [0] : This result , however , has to be interpreted with care since the double deletion of PEX13 and PEX14 did result in a significant decrease in immunologically detectable Pex7p ( Girzalsky , W. , and R. 
Erdmann , unpublished observations ) .\n", + "s256 (Discussion,result|result) [0] : The observations that Pex13p and Pex7p interact in the two-hybrid system and can be efficiently coimmunoprecipitated indicate that the proteins interact in vivo ( Figs. 3 and 4 ) .\n", + "s257 (Discussion,implication|result) [X] : Whether Pex13p directly binds Pex7p remains to be shown .\n", + "s258 (Discussion,method|result) [0] : Attempts to demonstrate direct binding of the proteins by coimmunoprecipitation of in vitro translated proteins were unsuccessful ( data not shown ) .\n", + "s260 (Discussion,implication|implication) [0] : However , two observations indicate that the hypothetical bridging protein is not one of the known binding partners for Pex13p .\n", + "s261 (Discussion,result|result) [X] : First , the Pex7p/Pex13p interaction is also observed in the absence of these proteins ( Figs. 3 and 4 ) , and second , the COOH-terminal SH3 domain alone is sufficient for the Pex13p/ Pex14p and Pex13p/Pex5p two-hybrid interaction , but not for the interaction of Pex13p with Pex7p ( exLink ) .\n", + "s262 (Discussion,implication) [X] : A direct interaction of Pex13p and Pex7p is further suggested by the genetic suppression of the defect caused by a functionally compromised HA-tagged Pex7p by overexpression of Pex13p ( Fig. 5 ) .\n", + "s263 (Discussion,implication|result) [0] : As discussed above , a Pex5p/Pex7p two-hybrid interaction is not observed in pex14Delta ( Fig. 1 ) .\n", + "s264 (Discussion,implication|result) [X] : At first , this observation seems rather surprising , since both Pex5p and Pex7p independently interact with Pex13p in the two-hybrid system ( Fig. 4 ) .\n", + "s267 (Discussion,implication) [X] : In support of this assumption , the amount of Pex5p coimmunoprecipitating with Pex7p in the absence of Pex14p is extremely reduced , despite the presence of significant amounts of Pex13p ( Fig. 
3 , lane pex14Delta ) .\n", + "s268 (Discussion,implication|result) [0] : Perhaps Pex13p does not usually associate simultaneously with both of the import receptors , or association is transient .\n", + "s271 (Discussion,result|result) [0] : One group has reported that the protein is exclusively localized in the peroxisomal lumen ( exLink , exLink ) , whereas others found the protein to be predominantly localized in the cytosol with a small amount associated with the peroxisomal membrane ( exLink ; exLink ) .\n", + "s272 (Discussion,implication|hypothesis) [0] : Because the SH3 domain alone does not mediate the interaction with Pex7p , we suggest that regions NH2-terminal of the SH3 domain may be required for the interaction or contribute to the correct conformation of the binding site .\n", + "s273 (Discussion,result|result) [0] : Previously , the COOH-terminal SH3 domain has been reported to face the cytosol ( exLink ; exLink ) , and we found that both the NH2 terminus and the COOH terminus of Pex13p are exposed to the cytosol ( Fig. 
6 ) , suggesting that the protein traverses the membrane with an even number of membrane spans .\n", + "s274 (Discussion,implication|hypothesis) [0] : In this respect , it is interesting to note that two regions which would fulfill the requirement for alpha-helical transmembrane segments are present in Pex13p ( exLink ) .\n", + "s275 (Discussion,fact|implication) [0] : The interaction of Pex13p with Pex7p has far reaching implications for our understanding of protein import into the peroxisomal matrix .\n", + "\n", + "105/279\n", + "25/39\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from nltk.corpus import stopwords\n", + "import re\n", + "\n", + "inFile = '/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/scidt_fries_bioc_tsv4/10087260.tsv'\n", + "#/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/pathwayLogic/scidt_bioc_sentences_tsv/11777939.tsv'\n", + "tsv = pd.read_csv(inFile, sep='\\t')\n", + "sentences = []\n", + "\n", + "stopwords = stopwords.words('english')\n", + "regex1 = re.compile(r\"[\\(\\)\\{\\}\\[\\]\\;\\.\\'\\\"\\,\\/\\_\\*]\", re.IGNORECASE)\n", + "regex2 = re.compile(r\"\\s+\", re.IGNORECASE)\n", + "\n", + "allHits = 0\n", + "hits = 0\n", + "j = 0\n", + "for i, row in tsv.iterrows():\n", + " sid = row['SentenceId']\n", + " codeStr = row['Codes']\n", + " paragraph = row['Paragraph']\n", + " text = row['Sentence Text']\n", + " heading = row['Headings']\n", + " floatingBox = row['FloatingBox?']\n", + " discourse = row['Discourse Type']\n", + " reachData = row['friesEventsTypes']\n", + " \n", + " j += 1\n", + " if(reachData == reachData):\n", + " allHits += 1\n", + "\n", + " if (heading != heading):\n", + " heading = \"\"\n", + "\n", + " if (floatingBox):\n", + " continue\n", + "\n", + " if (('implication' not in discourse) and\n", + " 'result' not in discourse):\n", + " continue\n", + "\n", + " if ('methods' in heading.lower()):\n", + " continue\n", + " \n", + " 
r = 'X'\n", + " if(reachData != reachData):\n", + " r = '0'\n", + " \n", + " if(reachData == reachData):\n", + " hits += 1\n", + "\n", + " print(sid + ' (' + heading + ',' + discourse + ') ' + '[' + r + '] : ' + text ) \n", + " \n", + " text = re.sub(regex1,\"\",text)\n", + " sent = regex2.split(text)\n", + " sent = [w for w in sent if w not in stopwords and len(w)>0]\n", + " sentences.append(sent)\n", + "\n", + " if 'exLink' in codeStr:\n", + " continue\n", + "\n", + " \n", + "print\n", + "print (str(len(sentences)) + '/' + str(j))\n", + "print (str(hits) + '/' + str(allHits))" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "wv = gensim.models.KeyedVectors.load_word2vec_format(\n", + " \"/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/embeddings_pubmed_files/PMC-w2v.bin\",\n", + " binary=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from gensim.models.word2vec import Word2Vec\n", + "\n", + "model = Word2Vec(iter=1) \n", + "model.wv = wv" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of sentences = 105\n", + "\n", + "........................................................................................................\n", + ".......................................................................................................\n", + "......................................................................................................\n", + ".....................................................................................................\n", + "....................................................................................................\n", + 
"...................................................................................................\n", + "..................................................................................................\n", + ".................................................................................................\n", + "................................................................................................\n", + "...............................................................................................\n", + "..............................................................................................\n", + ".............................................................................................\n", + "............................................................................................\n", + "...........................................................................................\n", + "..........................................................................................\n", + ".........................................................................................\n", + "........................................................................................\n", + ".......................................................................................\n", + "......................................................................................\n", + ".....................................................................................\n", + "....................................................................................\n", + "...................................................................................\n", + "..................................................................................\n", + ".................................................................................\n", + "................................................................................\n", + 
"...............................................................................\n", + "..............................................................................\n", + ".............................................................................\n", + "............................................................................\n", + "...........................................................................\n", + "..........................................................................\n", + ".........................................................................\n", + "........................................................................\n", + ".......................................................................\n", + "......................................................................\n", + ".....................................................................\n", + "....................................................................\n", + "...................................................................\n", + "..................................................................\n", + ".................................................................\n", + "................................................................\n", + "...............................................................\n", + "..............................................................\n", + ".............................................................\n", + "............................................................\n", + "...........................................................\n", + "..........................................................\n", + ".........................................................\n", + "........................................................\n", + ".......................................................\n", + "......................................................\n", + ".....................................................\n", + 
"....................................................\n", + "...................................................\n", + "..................................................\n", + ".................................................\n", + "................................................\n", + "...............................................\n", + "..............................................\n", + ".............................................\n", + "............................................\n", + "...........................................\n", + "..........................................\n", + ".........................................\n", + "........................................\n", + ".......................................\n", + "......................................\n", + ".....................................\n", + "....................................\n", + "...................................\n", + "..................................\n", + ".................................\n", + "................................\n", + "...............................\n", + "..............................\n", + ".............................\n", + "............................\n", + "...........................\n", + "..........................\n", + ".........................\n", + "........................\n", + ".......................\n", + "......................\n", + ".....................\n", + "....................\n", + "...................\n", + "..................\n", + ".................\n", + "................\n", + "...............\n", + "..............\n", + ".............\n", + "............\n", + "...........\n", + "..........\n", + ".........\n", + "........\n", + ".......\n", + "......\n", + ".....\n", + "....\n", + "...\n", + "..\n", + ".\n" + ] + } + ], + "source": [ + "import sys \n", + "print(\"Number of sentences = {:d}\".format(len(sentences))) \n", + "\n", + "dMatrix=[]\n", + "for i in range(0,len(sentences)):\n", + " row=[]\n", + " 
dMatrix.append(row)\n", + " sys.stdout.write('\\n')\n", + " for j in range(0,len(sentences)):\n", + " if(ij):\n", + " d = dMatrix[j][i]\n", + " dMatrix[i].append(d)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "inlinks = []\n", + "outlinks = []\n", + "j=0\n", + "for i, row in tsv.iterrows():\n", + " sid = row['SentenceId']\n", + " codeStr = row['Codes']\n", + " paragraph = row['Paragraph']\n", + " text = row['Sentence Text']\n", + " heading = row['Headings']\n", + " floatingBox = row['FloatingBox?']\n", + " discourse = row['Discourse Type']\n", + " reachData = row['friesEventsTypes']\n", + " \n", + " if(reachData == reachData):\n", + " allHits += 1\n", + "\n", + " if (heading != heading):\n", + " heading = \"\"\n", + "\n", + " if (floatingBox):\n", + " continue\n", + "\n", + " if (('implication' not in discourse) and\n", + " 'result' not in discourse):\n", + " continue\n", + "\n", + " if ('methods' in heading.lower()):\n", + " continue\n", + " \n", + " if 'exLink' in codeStr:\n", + " outlinks.append(j)\n", + " else: \n", + " inlinks.append(j)\n", + " j += 1\n", + " \n", + "ii = []\n", + "io = []\n", + "oo = []\n", + "for i in range(0,len(sentences)):\n", + " for j in range(0,len(sentences)):\n", + " if( i in inlinks and j in inlinks):\n", + " ii.append(dMatrix[i][j])\n", + " elif( i in outlinks and j in outlinks):\n", + " oo.append(dMatrix[i][j])\n", + " else: \n", + " io.append(dMatrix[i][j])" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ALL mean = 2.91, stdev = 0.71, (105, 105)\n", + "ii mean = 2.89, stdev = 0.73, (7225,)\n", + "io mean = 2.95, stdev = 0.70, (3400,)\n", + "oo mean = 2.92, stdev = 0.41, (400,)\n" + ] + } + ], + "source": [ + "a = np.array(dMatrix)\n", + "a_ii = np.array(ii)\n", + "a_oo = np.array(oo)\n", + "a_io = 
np.array(io)\n", + "\n", + "print(\"ALL mean = {:.2f}, stdev = {:.2f}, {:s}\".format(np.mean(a),np.std(a),a.shape))\n", + "print(\"ii mean = {:.2f}, stdev = {:.2f}, {:s}\".format(np.mean(a_ii),np.std(a_ii),a_ii.shape))\n", + "print(\"io mean = {:.2f}, stdev = {:.2f}, {:s}\".format(np.mean(a_io),np.std(a_io),a_io.shape))\n", + "print(\"oo mean = {:.2f}, stdev = {:.2f}, {:s}\".format(np.mean(a_oo),np.std(a_oo),a_oo.shape))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 0. 2.70951278 3.28758438 2.96953717 3.17719459 2.81105121\n", + " 3.35825087 3.46992865 2.29968692 3.29470469 3.05059795 2.63823061\n", + " 2.98713756 2.84195812 3.40902733 2.99265033 2.90544037 2.73017843\n", + " 3.22730077 3.28885195 2.88110962 2.56501906 2.92114793 2.54567875\n", + " 2.7520886 3.26020055 2.69512023 2.69949859 7.02568466 3.0830886\n", + " 2.44424402 2.8323922 3.29172478 2.11977776 2.6251516 3.31446568\n", + " 2.94809315 3.23356727 2.99033033 2.99645548 3.24785576 2.50086568\n", + " 3.34242533 3.31578968 3.32333538 2.51567323 3.1679571 2.77440658\n", + " 3.24562005 2.56683298 2.54401009 2.19415479 2.24739697 3.38171277\n", + " 2.82798443 2.85669571 3.11864001 2.65150638 3.22795487 2.24072686\n", + " 2.75729774 3.07123952 2.79272931 2.66202148 2.08089359 2.70754171\n", + " 2.34900525 2.88431247 2.43004056 3.19577731 2.67694075 2.29094977\n", + " 7.33531289 2.95541549 2.73295739 3.05693131 2.9249465 3.35044991\n", + " 2.68794189 2.74050119 2.66568818 2.89679137 2.74537219 7.22883396\n", + " 2.79979104]\n" + ] + } + ], + "source": [ + "#y = np.arange(35).reshape(5,7)\n", + "print a[a_i,a_i]\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 0. 
, 2.69064874, 2.81680236, ..., 3.16540014,\n", + " 2.83107124, 2.32218492],\n", + " [ 0. , 2.70951278, 2.30072444, ..., 2.44334137,\n", + " 2.72931747, 2.69064874],\n", + " [ 0. , 2.72334771, 3.28758438, ..., 2.72521473,\n", + " 2.81680236, 2.30072444],\n", + " ..., \n", + " [ 0. , 3.0078866 , 3.05008391, ..., 3.30905467,\n", + " 2.79979104, 2.67234395],\n", + " [ 0. , 3.02760509, 2.83107124, ..., 2.6596075 ,\n", + " 2.81466543, 2.79979104],\n", + " [ 0. , 2.32218492, 2.69064874, ..., 3.04252413,\n", + " 2.67234395, 2.79979104]])" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import numpy as np\n", + "from sklearn.manifold import TSNE\n", + "\n", + "X = np.array(dMatrix)\n", + "model = TSNE(n_components=2, random_state=0)\n", + "np.set_printoptions(suppress=True)\n", + "Xlayout = model.fit_transform(X) \n", + "X" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "from matplotlib import offsetbox\n", + "\n", + "#----------------------------------------------------------------------\n", + "# Scale and visualize the embedding vectors\n", + "def plot_embedding(X, title=None):\n", + " x_min, x_max = np.min(X, 0), np.max(X, 0)\n", + " X = (X - x_min) / (x_max - x_min)\n", + "\n", + " plt.figure()\n", + " ax = plt.subplot(111)\n", + " for i in range(X.shape[0]):\n", + " plt.text(X[i, 0], X[i, 1], str(i),\n", + " fontdict={'weight': 'bold', 'size': 9})\n", + "\n", + " '''\n", + " if hasattr(offsetbox, 'AnnotationBbox'):\n", + " # only print thumbnails with matplotlib > 1.0\n", + " shown_images = np.array([[1., 1.]]) # just 
something big\n", + " for i in range(digits.data.shape[0]):\n", + " dist = np.sum((X[i] - shown_images) ** 2, 1)\n", + " if np.min(dist) < 4e-3:\n", + " # don't show points that are too close\n", + " continue\n", + " shown_images = np.r_[shown_images, [X[i]]]\n", + " imagebox = offsetbox.AnnotationBbox(\n", + " offsetbox.OffsetImage(digits.images[i], cmap=plt.cm.gray_r),\n", + " X[i])\n", + " ax.add_artist(imagebox) \n", + " '''\n", + " plt.xticks([]), plt.yticks([])\n", + " if title is not None:\n", + " plt.title(title)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'sentences' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mprint\u001b[0m \u001b[0msentences\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;31m#plot_embedding(Xlayout,\"t-SNE embedding of the sentences\")\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;31m#plt.show()\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'sentences' is not defined" + ] + } + ], + "source": [ + "print sentences\n", + "\n", + "#plot_embedding(Xlayout,\"t-SNE embedding of the sentences\")\n", + "#plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Reproducing the demo above" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "('Features:', u'addresses, chicago, 
illinois, media, obama, president, press, speaks')\n" + ] + } + ], + "source": [ + "d1 = \"Obama speaks to the media in Illinois\"\n", + "d2 = \"The President addresses the press in Chicago\"\n", + "\n", + "vect = CountVectorizer(stop_words=\"english\").fit([d1, d2])\n", + "print(\"Features:\", \", \".join(vect.get_feature_names()))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The two documents are completely orthogonal in terms of bag-of-words" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(array([0, 0, 1, 1, 1, 0, 0, 1]), array([1, 1, 0, 0, 0, 1, 1, 0]))\n", + "cosine(doc_1, doc_2) = 1.00\n" + ] + } + ], + "source": [ + "from scipy.spatial.distance import cosine\n", + "v_1, v_2 = vect.transform([d1, d2])\n", + "v_1 = v_1.toarray().ravel()\n", + "v_2 = v_2.toarray().ravel()\n", + "print(v_1, v_2)\n", + "print(\"cosine(doc_1, doc_2) = {:.2f}\".format(cosine(v_1, v_2)))" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.7664012231\n", + "d(addresses, speaks) = 0.33\n", + "d(addresses, chicago) = 0.06\n" + ] + } + ], + "source": [ + "from sklearn.metrics import euclidean_distances\n", + "\n", + "#W_ = W[[vocab_dict[w] for w in vect.get_feature_names()]]\n", + "#D_ = euclidean_distances(W_)\n", + "print(\"d(addresses, speaks) = {:.2f}\".format(wv.similarity('addresses','speaks')))\n", + "print(\"d(addresses, chicago) = {:.2f}\".format(wv.similarity('addresses','chicago')))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will be using [``pyemd``](https://github.com/wmayner/pyemd), a Python wrapper for [Pele and Werman's implementation of the earth mover's distance](http://www.ariel.ac.il/sites/ofirpele/fastemd/)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'D_' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mv_1\u001b[0m \u001b[0;34m/=\u001b[0m \u001b[0mv_1\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mv_2\u001b[0m \u001b[0;34m/=\u001b[0m \u001b[0mv_2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0mD_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mD_\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdouble\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 9\u001b[0m \u001b[0mD_\u001b[0m \u001b[0;34m/=\u001b[0m \u001b[0mD_\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# just for comparison purposes\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0;32mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"d(doc_1, doc_2) = {:.2f}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0memd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv_1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv_2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mD_\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'D_' is not defined" + ] + } + ], + "source": [ + "from pyemd import emd\n", + 
"\n", + "# pyemd needs double precision input\n", + "v_1 = v_1.astype(np.double)\n", + "v_2 = v_2.astype(np.double)\n", + "v_1 /= v_1.sum()\n", + "v_2 /= v_2.sum()\n", + "D_ = D_.astype(np.double)\n", + "D_ /= D_.max() # just for comparison purposes\n", + "print(\"d(doc_1, doc_2) = {:.2f}\".format(emd(v_1, v_2, D_)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Document classification\n", + "\n", + "We will use the [*20 Newsgroups*](http://qwone.com/~jason/20Newsgroups/) classification task. Because WMD is an expensive computation, for this demo we just use a subset. To emphasize the power of the method, we use a larger test size, but train on relatively few samples." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "newsgroups = fetch_20newsgroups()\n", + "docs, y = newsgroups.data, newsgroups.target" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "docs_train, docs_test, y_train, y_test = train_test_split(docs, y,\n", + " train_size=100,\n", + " test_size=300,\n", + " random_state=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Since the `W` embedding array is pretty huge, we might as well restrict it to just the words that actually occur in the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "vect = CountVectorizer(stop_words=\"english\").fit(docs_train + docs_test)\n", + "common = [word for word in vect.get_feature_names() if word in vocab_dict]\n", + "W_common = W[[vocab_dict[w] for w in common]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can then create a fixed-vocabulary vectorizer using only the words we have embeddings for." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "vect = CountVectorizer(vocabulary=common, dtype=np.double)\n", + "X_train = vect.fit_transform(docs_train)\n", + "X_test = vect.transform(docs_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "One way to proceed is to just pre-compute the pairwise distances between all documents, and use them to search for hyperparameters and evaluate the model. However, that would incur some extra computation, and WMD is expensive. Also, it's not the most pleasant user interface. So we define some scikit-learn compatible estimators for computing the WMD.\n", + "\n", + "**`WordMoversKNN`** subclasses from `KNeighborsClassifier` and overrides the `predict` function to compute the WMD between all training and test samples.\n", + "\n", + "In practice, however, we often don't know what is the best `n_neighbors` to use. Simply wrapping `WordMoversKNN` in a `GridSearchCV` would be rather expensive because of all the distances that would need to be recomputed for every value of `n_neighbors`. So we introduce **`WordMoversKNNCV`**, which, when fitted, performs *cross-validation* to find the best value of `n_neighbors` (under any given evaluation metric), while only computing the WMD once per fold, and only across folds (saving `n_folds * fold_size ** 2` evaluations)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting word_movers_knn.py\n" + ] + } + ], + "source": [ + "\"\"\"%%file word_movers_knn.py\"\"\"\n", + "\n", + "# Authors: Vlad Niculae, Matt Kusner\n", + "# License: Simplified BSD\n", + "\n", + "import numpy as np\n", + "from sklearn.metrics import euclidean_distances\n", + "from sklearn.externals.joblib import Parallel, delayed\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.utils import check_array\n", + "from sklearn.cross_validation import check_cv\n", + "from sklearn.metrics.scorer import check_scoring\n", + "from sklearn.preprocessing import normalize\n", + "\n", + "from pyemd import emd\n", + "\n", + "\n", + "class WordMoversKNN(KNeighborsClassifier):\n", + " \"\"\"K nearest neighbors classifier using the Word Mover's Distance.\n", + "\n", + " Parameters\n", + " ----------\n", + " \n", + " W_embed : array, shape: (vocab_size, embed_size)\n", + " Precomputed word embeddings between vocabulary items.\n", + " Row indices should correspond to the columns in the bag-of-words input.\n", + "\n", + " n_neighbors : int, optional (default = 5)\n", + " Number of neighbors to use by default for :meth:`k_neighbors` queries.\n", + "\n", + " n_jobs : int, optional (default = 1)\n", + " The number of parallel jobs to run for Word Mover's Distance computation.\n", + " If ``-1``, then the number of jobs is set to the number of CPU cores.\n", + " \n", + " verbose : int, optional\n", + " Controls the verbosity; the higher, the more messages. Defaults to 0.\n", + " \n", + " References\n", + " ----------\n", + " \n", + " Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. 
Weinberger\n", + " From Word Embeddings To Document Distances\n", + " The International Conference on Machine Learning (ICML), 2015\n", + " http://mkusner.github.io/publications/WMD.pdf\n", + " \n", + " \"\"\"\n", + " _pairwise = False\n", + "\n", + " def __init__(self, W_embed, n_neighbors=1, n_jobs=1, verbose=False):\n", + " self.W_embed = W_embed\n", + " self.verbose = verbose\n", + " super(WordMoversKNN, self).__init__(n_neighbors=n_neighbors, n_jobs=n_jobs,\n", + " metric='precomputed', algorithm='brute')\n", + "\n", + " def _wmd(self, i, row, X_train):\n", + " \"\"\"Compute the WMD between training sample i and given test row.\n", + " \n", + " Assumes that `row` and train samples are sparse BOW vectors summing to 1.\n", + " \"\"\"\n", + " union_idx = np.union1d(X_train[i].indices, row.indices)\n", + " W_minimal = self.W_embed[union_idx]\n", + " W_dist = euclidean_distances(W_minimal)\n", + " bow_i = X_train[i, union_idx].A.ravel()\n", + " bow_j = row[:, union_idx].A.ravel()\n", + " return emd(bow_i, bow_j, W_dist)\n", + " \n", + " def _wmd_row(self, row, X_train):\n", + " \"\"\"Wrapper to compute the WMD of a row with all training samples.\n", + " \n", + " Assumes that `row` and train samples are sparse BOW vectors summing to 1.\n", + " Useful for parallelization.\n", + " \"\"\"\n", + " n_samples_train = X_train.shape[0]\n", + " return [self._wmd(i, row, X_train) for i in range(n_samples_train)]\n", + "\n", + " def _pairwise_wmd(self, X_test, X_train=None):\n", + " \"\"\"Computes the word mover's distance between all train and test points.\n", + " \n", + " Parallelized over rows of X_test.\n", + " \n", + " Assumes that train and test samples are sparse BOW vectors summing to 1.\n", + " \n", + " Parameters\n", + " ----------\n", + " X_test: scipy.sparse matrix, shape: (n_test_samples, vocab_size)\n", + " Test samples.\n", + " \n", + " X_train: scipy.sparse matrix, shape: (n_train_samples, vocab_size)\n", + " Training samples. 
If `None`, uses the samples the estimator was fit with.\n", + " \n", + " Returns\n", + " -------\n", + " dist : array, shape: (n_test_samples, n_train_samples)\n", + " Distances between all test samples and all train samples.\n", + " \n", + " \"\"\"\n", + " n_samples_test = X_test.shape[0]\n", + " \n", + " if X_train is None:\n", + " X_train = self._fit_X\n", + "\n", + " dist = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(\n", + " delayed(self._wmd_row)(test_sample, X_train)\n", + " for test_sample in X_test)\n", + "\n", + " return np.array(dist)\n", + "\n", + " def fit(self, X, y):\n", + " \"\"\"Fit the model using X as training data and y as target values\n", + "\n", + " Parameters\n", + " ----------\n", + " X : scipy sparse matrix, shape: (n_samples, n_features)\n", + " Training data. \n", + "\n", + " y : {array-like, sparse matrix}\n", + " Target values of shape = [n_samples] or [n_samples, n_outputs]\n", + "\n", + " \"\"\"\n", + " X = check_array(X, accept_sparse='csr', copy=True)\n", + " X = normalize(X, norm='l1', copy=False)\n", + " return super(WordMoversKNN, self).fit(X, y)\n", + "\n", + " def predict(self, X):\n", + " \"\"\"Predict the class labels for the provided data\n", + " Parameters\n", + " ----------\n", + " X : scipy.sparse matrix, shape (n_test_samples, vocab_size)\n", + " Test samples.\n", + "\n", + " Returns\n", + " -------\n", + " y : array of shape [n_samples]\n", + " Class labels for each data sample.\n", + " \"\"\"\n", + " X = check_array(X, accept_sparse='csr', copy=True)\n", + " X = normalize(X, norm='l1', copy=False)\n", + " dist = self._pairwise_wmd(X)\n", + " return super(WordMoversKNN, self).predict(dist)\n", + " \n", + " \n", + "class WordMoversKNNCV(WordMoversKNN):\n", + " \"\"\"Cross-validated KNN classifier using the Word Mover's Distance.\n", + "\n", + " Parameters\n", + " ----------\n", + " W_embed : array, shape: (vocab_size, embed_size)\n", + " Precomputed word embeddings between vocabulary items.\n", + " Row indices 
should correspond to the columns in the bag-of-words input.\n", + "\n", + " n_neighbors_try : sequence, optional\n", + " List of ``n_neighbors`` values to try.\n", + " If None, tries 1-5 neighbors.\n", + "\n", + " scoring : string, callable or None, optional, default: None\n", + " A string (see model evaluation documentation) or\n", + " a scorer callable object / function with signature\n", + " ``scorer(estimator, X, y)``.\n", + "\n", + " cv : int, cross-validation generator or an iterable, optional\n", + " Determines the cross-validation splitting strategy.\n", + " Possible inputs for cv are:\n", + " - None, to use the default 3-fold cross-validation,\n", + " - integer, to specify the number of folds.\n", + " - An object to be used as a cross-validation generator.\n", + " - An iterable yielding train/test splits.\n", + " For integer/None inputs, StratifiedKFold is used.\n", + "\n", + " n_jobs : int, optional (default = 1)\n", + " The number of parallel jobs to run for Word Mover's Distance computation.\n", + " If ``-1``, then the number of jobs is set to the number of CPU cores.\n", + "\n", + " verbose : int, optional\n", + " Controls the verbosity; the higher, the more messages. Defaults to 0.\n", + "\n", + " Attributes\n", + " ----------\n", + " cv_scores_ : array, shape (n_folds, len(n_neighbors_try))\n", + " Test set scores for each fold.\n", + "\n", + " n_neighbors_ : int,\n", + " The best `n_neighbors` value found.\n", + "\n", + " References\n", + " ----------\n", + "\n", + " Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. 
Weinberger\n", + " From Word Embeddings To Document Distances\n", + " The International Conference on Machine Learning (ICML), 2015\n", + " http://mkusner.github.io/publications/WMD.pdf\n", + " \n", + " \"\"\"\n", + " def __init__(self, W_embed, n_neighbors_try=None, scoring=None, cv=3,\n", + " n_jobs=1, verbose=False):\n", + " self.cv = cv\n", + " self.n_neighbors_try = n_neighbors_try\n", + " self.scoring = scoring\n", + " super(WordMoversKNNCV, self).__init__(W_embed,\n", + " n_neighbors=None,\n", + " n_jobs=n_jobs,\n", + " verbose=verbose)\n", + "\n", + " def fit(self, X, y):\n", + " \"\"\"Fit KNN model by choosing the best `n_neighbors`.\n", + " \n", + " Parameters\n", + " -----------\n", + " X : scipy.sparse matrix, (n_samples, vocab_size)\n", + " Data\n", + " y : ndarray, shape (n_samples,) or (n_samples, n_targets)\n", + " Target\n", + " \"\"\"\n", + " if self.n_neighbors_try is None:\n", + " n_neighbors_try = range(1, 6)\n", + " else:\n", + " n_neighbors_try = self.n_neighbors_try\n", + "\n", + " X = check_array(X, accept_sparse='csr', copy=True)\n", + " X = normalize(X, norm='l1', copy=False)\n", + "\n", + " cv = check_cv(self.cv, X, y)\n", + " knn = KNeighborsClassifier(metric='precomputed', algorithm='brute')\n", + " scorer = check_scoring(knn, scoring=self.scoring)\n", + "\n", + " scores = []\n", + " for train_ix, test_ix in cv:\n", + " dist = self._pairwise_wmd(X[test_ix], X[train_ix])\n", + " knn.fit(X[train_ix], y[train_ix])\n", + " scores.append([\n", + " scorer(knn.set_params(n_neighbors=k), dist, y[test_ix])\n", + " for k in n_neighbors_try\n", + " ])\n", + " scores = np.array(scores)\n", + " self.cv_scores_ = scores\n", + "\n", + " best_k_ix = np.argmax(np.mean(scores, axis=0))\n", + " best_k = n_neighbors_try[best_k_ix]\n", + " self.n_neighbors = self.n_neighbors_ = best_k\n", + "\n", + " return super(WordMoversKNNCV, self).fit(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + 
"outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=3)]: Done 12 tasks | elapsed: 30.8s\n", + "[Parallel(n_jobs=3)]: Done 34 out of 34 | elapsed: 2.0min finished\n", + "[Parallel(n_jobs=3)]: Done 12 tasks | elapsed: 25.7s\n", + "[Parallel(n_jobs=3)]: Done 33 out of 33 | elapsed: 2.9min finished\n", + "[Parallel(n_jobs=3)]: Done 12 tasks | elapsed: 53.3s\n", + "[Parallel(n_jobs=3)]: Done 33 out of 33 | elapsed: 2.0min finished\n" + ] + }, + { + "data": { + "text/plain": [ + "WordMoversKNNCV(W_embed=memmap([[ 0.04283, -0.01124, ..., -0.05679, -0.00763],\n", + " [ 0.02884, -0.05923, ..., -0.04744, 0.06698],\n", + " ...,\n", + " [ 0.08428, -0.15534, ..., -0.01413, 0.04561],\n", + " [-0.02052, 0.08666, ..., 0.03659, 0.10445]]),\n", + " cv=3, n_jobs=3, n_neighbors_try=range(1, 20), scoring=None,\n", + " verbose=5)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "knn_cv = WordMoversKNNCV(cv=3,\n", + " n_neighbors_try=range(1, 20),\n", + " W_embed=W_common, verbose=5, n_jobs=3)\n", + "knn_cv.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CV score: 0.38\n" + ] + } + ], + "source": [ + "print(\"CV score: {:.2f}\".format(knn_cv.cv_scores_.mean(axis=0).max()))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=3)]: Done 12 tasks | elapsed: 32.2s\n", + "[Parallel(n_jobs=3)]: Done 66 tasks | elapsed: 4.3min\n", + "[Parallel(n_jobs=3)]: Done 156 tasks | elapsed: 12.5min\n", + "[Parallel(n_jobs=3)]: Done 282 tasks | elapsed: 30.5min\n", + "[Parallel(n_jobs=3)]: Done 300 out of 300 | elapsed: 48.9min finished\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", 
+ "text": [ + "Test score: 0.31\n" + ] + } + ], + "source": [ + "print(\"Test score: {:.2f}\".format(knn_cv.score(X_test, y_test)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Comparison with other models\n", + "\n", + "Now let's see how WMD compares with some common approaches, on bag of words features. The most apples-to-apples comparison would be\n", + "K nearest neighbors with a cosine similarity metric. This approach performs worse than using WMD. (All scores are accuracies.)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from sklearn.svm import LinearSVC\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.grid_search import GridSearchCV" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CV score: 0.34\n", + "Test score: 0.22\n" + ] + } + ], + "source": [ + "knn_grid = GridSearchCV(KNeighborsClassifier(metric='cosine', algorithm='brute'),\n", + " dict(n_neighbors=list(range(1, 20))),\n", + " cv=3)\n", + "knn_grid.fit(X_train, y_train)\n", + "print(\"CV score: {:.2f}\".format(knn_grid.best_score_))\n", + "print(\"Test score: {:.2f}\".format(knn_grid.score(X_test, y_test)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Another common method for text classification is the linear support vector machine on bag of words.\n", + "This performs a bit better than vanilla cosine KNN, but worse than using WMD in this setting. In our experience,\n", + "this seems to depend on the amount of training data available." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CV score: 0.35\n", + "Test score: 0.27\n" + ] + } + ], + "source": [ + "svc_grid = GridSearchCV(LinearSVC(),\n", + " dict(C=np.logspace(-6, 6, 13, base=2)),\n", + " cv=3)\n", + "svc_grid.fit(X_train, y_train)\n", + "print(\"CV score: {:.2f}\".format(svc_grid.best_score_))\n", + "print(\"Test score: {:.2f}\".format(svc_grid.score(X_test, y_test)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What have we learned?\n", + "\n", + "WMD is much better at capturing semantic similarity between documents than cosine, due to its ability to generalize to unseen words. The SVM does somewhat better than cosine KNN, but still lacks such out-of-vocabulary generalization. Given enough data, WMD can probably improve this margin, especially using something like metric learning on top.\n", + "\n", + "The exact WMD, as we have used it here, is pretty slow. This code is not optimized as much as it could be; there is potential for speedups through caching and using Cython.\n", + "However, a major limitation remains the cost of actually computing the EMD. To scale even higher, exactness can be relaxed by using lower bounds. In our next post, we will compare such optimization strategies, as discussed in [the WMD paper](http://mkusner.github.io/publications/WMD.pdf)."
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.13" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/05-02-2017 Word Movers Distance Measures Based on Experiments.ipynb b/notebooks/05-02-2017 Word Movers Distance Measures Based on Experiments.ipynb new file mode 100644 index 0000000..e704bd5 --- /dev/null +++ b/notebooks/05-02-2017 Word Movers Distance Measures Based on Experiments.ipynb @@ -0,0 +1,1267 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Similarity between experiments in INTACT papers\n", + "\n", + "Here we examine the analysis of text tagged for different experiments to see if there is a discernable difference between the sentences from different experiments. \n", + "\n", + "We should also perhaps attempt to annotate for entities in each sentence as well." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "import numpy as np\n", + "import gensim\n", + "\n", + "from sklearn.datasets import fetch_20newsgroups\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from sklearn.cross_validation import train_test_split" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Unnamed: 0SentenceIdClause TextCodesExperimentValuesParagraphHeadingsFloatingBox?Discourse TypeOffset_BeginOffset_Endfig_spans
00s1Involvement of Pex13p in Pex14p Localization a...[]NaN-NaNFalsenone127248NaN
11s2Pex13p is the putative docking protein for per...[]NaNp3NaNFalsefact799922NaN
22s3Pex14p interacts with both the PTS1 - and PTS2...[]NaNp3NaNFalsefact9231077NaN
33s4We report the involvement of Pex13p in peroxis...[]NaNp3NaNFalsefact10781164NaN
44s5Like Pex14p ,[]NaNp3NaNFalsenone11651177NaN
55s5Pex13p not only interacts with the PTS1-recept...[]NaNp3NaNFalsefact11781325NaN
66s6In support of distinct peroxisomal binding sit...[]NaNp3NaNFalsehypothesis13261462NaN
77s6evidence for the interaction of Pex7p and Pex1...[]NaNp3NaNFalseimplication14631619NaN
88s7Accordingly , we conclude[]NaNp3NaNFalsehypothesis16201644NaN
99s7that Pex7p and Pex13p functionally interact du...[]NaNp3NaNFalsehypothesis16451743NaN
1010s8NH2-terminal regions of Pex13p are required fo...[]NaNp3NaNFalseimplication17441830NaN
1111s8while the COOH-terminal SH3 domain alone is su...[]NaNp3NaNFalseimplication18311936NaN
1212s9Reinvestigation of the topology revealed both ...[]NaNp3NaNFalseresult19372000NaN
1313s9to be oriented towards the cytosol .[]NaNp3NaNFalseimplication20012036NaN
1414s10We also found Pex13p to be required for peroxi...[]NaNp3NaNFalseresult20372115NaN
1515s10the SH3 domain of Pex13p may not provide the o...[]NaNp3NaNFalseimplication21162218NaN
1616s11Peroxisomal matrix proteins are synthesized on...[exLink]NaNp4NaNFalsefact22672419NaN
1717s12The presence of two distinct peroxisomal targe...[]NaNp4NaNFalsefact24202583NaN
1818s13PTS1 , present in the majority of peroxisomal ...[]NaNp4NaNFalsefact25842736NaN
1919s13for review see McNew and Goodman , 1996 ) .[exLink]NaNp4NaNFalsefact27372777NaN
2020s14Only one known peroxisomal matrix protein in S...[]NaNp4NaNFalsefact27782901NaN
2121s14which is typically localized close to the NH2 ...[exLink]NaNp4NaNFalsefact29023086NaN
2222s15Recognition of PTS1 and PTS2 targeting signals...[]NaNp5NaNFalsefact30873215NaN
2323s15for review see Subramani , 1996 ; Erdmann et a...[exLink]NaNp5NaNFalsefact32163270NaN
2424s16Cells deficient in either protein display part...[]NaNp5NaNFalsefact32713341NaN
2525s17pex5Delta cells correctly localize PTS2 protei...[]NaNp5NaNFalsefact33423444NaN
2626s18pex7Delta cells exhibit the reverse phenotype[]NaNp5NaNFalseresult34453491NaN
2727s18for review see Elgersma and Tabak , 1996 ) .[exLink]NaNp5NaNFalseresult34923533NaN
2828s19The intracellular localization of both targeti...[]NaNp5NaNFalseimplication35343628NaN
2929s20A predominantly cytosolic , membrane-bound , a...[]NaNp5NaNFalseresult36293749NaN
.......................................
413413s263a Pex5p/Pex7p two-hybrid interaction is not ob...[inLink]f1p71DiscussionFalseresult4060440681NaN
414414s264At first , this observation seems rather surpr...[]NaNp71DiscussionFalseimplication4068240733NaN
415415s264since both Pex5p and Pex7p independently inter...[inLink]f4p71DiscussionFalseresult4073440830NaN
416416s265One could imagine[]NaNp71DiscussionFalsehypothesis4083140848NaN
417417s265that Pex13p may serve as a bridging molecule b...[]NaNp71DiscussionFalsehypothesis4084941004NaN
418418s266However , the amount of Pex5p simultaneously a...[]NaNp71DiscussionFalsehypothesis4100541104NaN
419419s266to give a positive response .[]NaNp71DiscussionFalsehypothesis4110541133NaN
420420s267In support of this assumption , the amount of ...[inLink]f3p71DiscussionFalseimplication4113441345NaN
421421s268Perhaps Pex13p does not usually associate simu...[]NaNp71DiscussionFalseimplication4134641436NaN
422422s268or association is transient .[]NaNp71DiscussionFalseresult4143741466NaN
423423s269The domain of Pex13 that interacts with Pex7p ...[]NaNp72DiscussionFalsefact4146741561NaN
424424s269where the interaction occurs , remains unknown .[]NaNp72DiscussionFalsefact4156241608NaN
425425s270Furthermore , the intracellular localization o...[]NaNp72DiscussionFalsefact4160941699NaN
426426s271One group has reported[]NaNp72DiscussionFalseresult4170041722NaN
427427s271that the protein is exclusively localized in t...[exLink]NaNp72DiscussionFalseresult4172342008NaN
428428s272Because the SH3 domain alone does not mediate ...[]NaNp72DiscussionFalseimplication4200942093NaN
429429s272that regions NH2-terminal of the SH3 domain ma...[]NaNp72DiscussionFalsehypothesis4209442236NaN
430430s273Previously , the COOH-terminal SH3 domain has ...[exLink]NaNp72DiscussionFalseresult4223742364NaN
431431s273we found that both the NH2 terminus and the CO...[inLink]f6p72DiscussionFalseresult4236542559NaN
432432s274In this respect , it is interesting[]NaNp72DiscussionFalseimplication4256042594NaN
433433s274to note that two regions which would fulfill t...[exLink]NaNp72DiscussionFalsehypothesis4259542743NaN
434434s275The interaction of Pex13p with Pex7p has far[]NaNp73DiscussionFalsefact4274442788NaN
435435s275reaching implications for our understanding of...[]NaNp73DiscussionFalseimplication4278942879NaN
436436s276Why are there several binding sites for the im...[]NaNp73DiscussionFalsefact4288042969NaN
437437s277One hypothesis suggests[]NaNp73DiscussionFalsehypothesis4297042993NaN
438438s277that the multiple interactions reflect the exi...[exLink]NaNp73DiscussionFalsehypothesis4299443203NaN
439439s278Our confirmation that at least two peroxisomal...[]NaNp73DiscussionFalsehypothesis4320443303NaN
440440s278about which functions as the docking protein f...[]NaNp73DiscussionFalsehypothesis4330443381NaN
441441s279Experimental evidence that Pex13p may be the d...[exLink]NaNp73DiscussionFalsehypothesis4338243508NaN
442442s279the unsolved questions stress the need for rel...[]NaNp73DiscussionFalsehypothesis4350943662NaN
\n", + "

443 rows × 12 columns

\n", + "
" + ], + "text/plain": [ + " Unnamed: 0 SentenceId Clause Text \\\n", + "0 0 s1 Involvement of Pex13p in Pex14p Localization a... \n", + "1 1 s2 Pex13p is the putative docking protein for per... \n", + "2 2 s3 Pex14p interacts with both the PTS1 - and PTS2... \n", + "3 3 s4 We report the involvement of Pex13p in peroxis... \n", + "4 4 s5 Like Pex14p , \n", + "5 5 s5 Pex13p not only interacts with the PTS1-recept... \n", + "6 6 s6 In support of distinct peroxisomal binding sit... \n", + "7 7 s6 evidence for the interaction of Pex7p and Pex1... \n", + "8 8 s7 Accordingly , we conclude \n", + "9 9 s7 that Pex7p and Pex13p functionally interact du... \n", + "10 10 s8 NH2-terminal regions of Pex13p are required fo... \n", + "11 11 s8 while the COOH-terminal SH3 domain alone is su... \n", + "12 12 s9 Reinvestigation of the topology revealed both ... \n", + "13 13 s9 to be oriented towards the cytosol . \n", + "14 14 s10 We also found Pex13p to be required for peroxi... \n", + "15 15 s10 the SH3 domain of Pex13p may not provide the o... \n", + "16 16 s11 Peroxisomal matrix proteins are synthesized on... \n", + "17 17 s12 The presence of two distinct peroxisomal targe... \n", + "18 18 s13 PTS1 , present in the majority of peroxisomal ... \n", + "19 19 s13 for review see McNew and Goodman , 1996 ) . \n", + "20 20 s14 Only one known peroxisomal matrix protein in S... \n", + "21 21 s14 which is typically localized close to the NH2 ... \n", + "22 22 s15 Recognition of PTS1 and PTS2 targeting signals... \n", + "23 23 s15 for review see Subramani , 1996 ; Erdmann et a... \n", + "24 24 s16 Cells deficient in either protein display part... \n", + "25 25 s17 pex5Delta cells correctly localize PTS2 protei... \n", + "26 26 s18 pex7Delta cells exhibit the reverse phenotype \n", + "27 27 s18 for review see Elgersma and Tabak , 1996 ) . \n", + "28 28 s19 The intracellular localization of both targeti... \n", + "29 29 s20 A predominantly cytosolic , membrane-bound , a... \n", + ".. ... 
... ... \n", + "413 413 s263 a Pex5p/Pex7p two-hybrid interaction is not ob... \n", + "414 414 s264 At first , this observation seems rather surpr... \n", + "415 415 s264 since both Pex5p and Pex7p independently inter... \n", + "416 416 s265 One could imagine \n", + "417 417 s265 that Pex13p may serve as a bridging molecule b... \n", + "418 418 s266 However , the amount of Pex5p simultaneously a... \n", + "419 419 s266 to give a positive response . \n", + "420 420 s267 In support of this assumption , the amount of ... \n", + "421 421 s268 Perhaps Pex13p does not usually associate simu... \n", + "422 422 s268 or association is transient . \n", + "423 423 s269 The domain of Pex13 that interacts with Pex7p ... \n", + "424 424 s269 where the interaction occurs , remains unknown . \n", + "425 425 s270 Furthermore , the intracellular localization o... \n", + "426 426 s271 One group has reported \n", + "427 427 s271 that the protein is exclusively localized in t... \n", + "428 428 s272 Because the SH3 domain alone does not mediate ... \n", + "429 429 s272 that regions NH2-terminal of the SH3 domain ma... \n", + "430 430 s273 Previously , the COOH-terminal SH3 domain has ... \n", + "431 431 s273 we found that both the NH2 terminus and the CO... \n", + "432 432 s274 In this respect , it is interesting \n", + "433 433 s274 to note that two regions which would fulfill t... \n", + "434 434 s275 The interaction of Pex13p with Pex7p has far \n", + "435 435 s275 reaching implications for our understanding of... \n", + "436 436 s276 Why are there several binding sites for the im... \n", + "437 437 s277 One hypothesis suggests \n", + "438 438 s277 that the multiple interactions reflect the exi... \n", + "439 439 s278 Our confirmation that at least two peroxisomal... \n", + "440 440 s278 about which functions as the docking protein f... \n", + "441 441 s279 Experimental evidence that Pex13p may be the d... \n", + "442 442 s279 the unsolved questions stress the need for rel... 
\n", + "\n", + " Codes ExperimentValues Paragraph Headings FloatingBox? \\\n", + "0 [] NaN - NaN False \n", + "1 [] NaN p3 NaN False \n", + "2 [] NaN p3 NaN False \n", + "3 [] NaN p3 NaN False \n", + "4 [] NaN p3 NaN False \n", + "5 [] NaN p3 NaN False \n", + "6 [] NaN p3 NaN False \n", + "7 [] NaN p3 NaN False \n", + "8 [] NaN p3 NaN False \n", + "9 [] NaN p3 NaN False \n", + "10 [] NaN p3 NaN False \n", + "11 [] NaN p3 NaN False \n", + "12 [] NaN p3 NaN False \n", + "13 [] NaN p3 NaN False \n", + "14 [] NaN p3 NaN False \n", + "15 [] NaN p3 NaN False \n", + "16 [exLink] NaN p4 NaN False \n", + "17 [] NaN p4 NaN False \n", + "18 [] NaN p4 NaN False \n", + "19 [exLink] NaN p4 NaN False \n", + "20 [] NaN p4 NaN False \n", + "21 [exLink] NaN p4 NaN False \n", + "22 [] NaN p5 NaN False \n", + "23 [exLink] NaN p5 NaN False \n", + "24 [] NaN p5 NaN False \n", + "25 [] NaN p5 NaN False \n", + "26 [] NaN p5 NaN False \n", + "27 [exLink] NaN p5 NaN False \n", + "28 [] NaN p5 NaN False \n", + "29 [] NaN p5 NaN False \n", + ".. ... ... ... ... ... 
\n", + "413 [inLink] f1 p71 Discussion False \n", + "414 [] NaN p71 Discussion False \n", + "415 [inLink] f4 p71 Discussion False \n", + "416 [] NaN p71 Discussion False \n", + "417 [] NaN p71 Discussion False \n", + "418 [] NaN p71 Discussion False \n", + "419 [] NaN p71 Discussion False \n", + "420 [inLink] f3 p71 Discussion False \n", + "421 [] NaN p71 Discussion False \n", + "422 [] NaN p71 Discussion False \n", + "423 [] NaN p72 Discussion False \n", + "424 [] NaN p72 Discussion False \n", + "425 [] NaN p72 Discussion False \n", + "426 [] NaN p72 Discussion False \n", + "427 [exLink] NaN p72 Discussion False \n", + "428 [] NaN p72 Discussion False \n", + "429 [] NaN p72 Discussion False \n", + "430 [exLink] NaN p72 Discussion False \n", + "431 [inLink] f6 p72 Discussion False \n", + "432 [] NaN p72 Discussion False \n", + "433 [exLink] NaN p72 Discussion False \n", + "434 [] NaN p73 Discussion False \n", + "435 [] NaN p73 Discussion False \n", + "436 [] NaN p73 Discussion False \n", + "437 [] NaN p73 Discussion False \n", + "438 [exLink] NaN p73 Discussion False \n", + "439 [] NaN p73 Discussion False \n", + "440 [] NaN p73 Discussion False \n", + "441 [exLink] NaN p73 Discussion False \n", + "442 [] NaN p73 Discussion False \n", + "\n", + " Discourse Type Offset_Begin Offset_End fig_spans \n", + "0 none 127 248 NaN \n", + "1 fact 799 922 NaN \n", + "2 fact 923 1077 NaN \n", + "3 fact 1078 1164 NaN \n", + "4 none 1165 1177 NaN \n", + "5 fact 1178 1325 NaN \n", + "6 hypothesis 1326 1462 NaN \n", + "7 implication 1463 1619 NaN \n", + "8 hypothesis 1620 1644 NaN \n", + "9 hypothesis 1645 1743 NaN \n", + "10 implication 1744 1830 NaN \n", + "11 implication 1831 1936 NaN \n", + "12 result 1937 2000 NaN \n", + "13 implication 2001 2036 NaN \n", + "14 result 2037 2115 NaN \n", + "15 implication 2116 2218 NaN \n", + "16 fact 2267 2419 NaN \n", + "17 fact 2420 2583 NaN \n", + "18 fact 2584 2736 NaN \n", + "19 fact 2737 2777 NaN \n", + "20 fact 2778 2901 NaN \n", + "21 
fact 2902 3086 NaN \n", + "22 fact 3087 3215 NaN \n", + "23 fact 3216 3270 NaN \n", + "24 fact 3271 3341 NaN \n", + "25 fact 3342 3444 NaN \n", + "26 result 3445 3491 NaN \n", + "27 result 3492 3533 NaN \n", + "28 implication 3534 3628 NaN \n", + "29 result 3629 3749 NaN \n", + ".. ... ... ... ... \n", + "413 result 40604 40681 NaN \n", + "414 implication 40682 40733 NaN \n", + "415 result 40734 40830 NaN \n", + "416 hypothesis 40831 40848 NaN \n", + "417 hypothesis 40849 41004 NaN \n", + "418 hypothesis 41005 41104 NaN \n", + "419 hypothesis 41105 41133 NaN \n", + "420 implication 41134 41345 NaN \n", + "421 implication 41346 41436 NaN \n", + "422 result 41437 41466 NaN \n", + "423 fact 41467 41561 NaN \n", + "424 fact 41562 41608 NaN \n", + "425 fact 41609 41699 NaN \n", + "426 result 41700 41722 NaN \n", + "427 result 41723 42008 NaN \n", + "428 implication 42009 42093 NaN \n", + "429 hypothesis 42094 42236 NaN \n", + "430 result 42237 42364 NaN \n", + "431 result 42365 42559 NaN \n", + "432 implication 42560 42594 NaN \n", + "433 hypothesis 42595 42743 NaN \n", + "434 fact 42744 42788 NaN \n", + "435 implication 42789 42879 NaN \n", + "436 fact 42880 42969 NaN \n", + "437 hypothesis 42970 42993 NaN \n", + "438 hypothesis 42994 43203 NaN \n", + "439 hypothesis 43204 43303 NaN \n", + "440 hypothesis 43304 43381 NaN \n", + "441 hypothesis 43382 43508 NaN \n", + "442 hypothesis 43509 43662 NaN \n", + "\n", + "[443 rows x 12 columns]" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pandas as pd\n", + "from nltk.corpus import stopwords\n", + "import re\n", + "\n", + "inFile = '/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/tsv_span/10087260_spans.tsv'\n", + "#/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/pathwayLogic/scidt_bioc_sentences_tsv/11777939.tsv'\n", + "tsv = pd.read_csv(inFile, sep='\\t')\n", + "\n", + "tsv" + 
] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "for i, row in tsv.iterrows():\n", + " sid = row['SentenceId']\n", + " codeStr = row['Codes']\n", + " paragraph = row['Paragraph']\n", + " text = row['Sentence Text']\n", + " heading = row['Headings']\n", + " floatingBox = row['FloatingBox?']\n", + " discourse = row['Discourse Type']\n", + " fig_spans = row['fig_spans']\n", + " \n", + " j += 1\n", + " if(reachData == reachData):\n", + " allHits += 1\n", + "\n", + " if (heading != heading):\n", + " heading = \"\"\n", + "\n", + " if (floatingBox):\n", + " continue\n", + "\n", + " if (('implication' not in discourse) and\n", + " 'result' not in discourse):\n", + " continue\n", + "\n", + " if ('methods' in heading.lower()):\n", + " continue\n", + " \n", + " r = 'X'\n", + " if(reachData != reachData):\n", + " r = '0'\n", + " \n", + " if(reachData == reachData):\n", + " hits += 1\n", + "\n", + " print(sid + ' (' + heading + ',' + discourse + ') ' + '[' + r + '] : ' + text ) \n", + " \n", + " text = re.sub(regex1,\"\",text)\n", + " sent = regex2.split(text)\n", + " sent = [w for w in sent if w not in stopwords and len(w)>0]\n", + " sentences.append(sent)\n", + "\n", + " if 'exLink' in codeStr:\n", + " continue\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.13" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/05-03-17 Building CoSID frames as Linked Data..ipynb b/notebooks/05-03-17 Building CoSID frames as Linked Data..ipynb new file mode 100644 index 0000000..91cd8ef --- /dev/null +++ b/notebooks/05-03-17 Building CoSID frames as Linked 
Data..ipynb @@ -0,0 +1,34 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.13" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/Analyzing Evidence + Claim Sentences .ipynb b/notebooks/Analyzing Evidence + Claim Sentences .ipynb new file mode 100644 index 0000000..5b95563 --- /dev/null +++ b/notebooks/Analyzing Evidence + Claim Sentences .ipynb @@ -0,0 +1,434 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from nltk.corpus import stopwords\n", + "import re\n", + "\n", + "def retrieve_sentences_for_modeling(inFile, fid):\n", + " tsv = pd.read_csv(inFile, sep='\\t')\n", + " sentences = []\n", + "\n", + " sw = stopwords.words('english')\n", + " regex1 = re.compile(r\"[\\(\\)\\{\\}\\[\\]\\;\\.\\'\\\"\\,\\/\\_\\*]\", re.IGNORECASE)\n", + " regex2 = re.compile(r\"\\s+\", re.IGNORECASE)\n", + "\n", + " allHits = 0\n", + " hits = 0\n", + " j = 0\n", + " for i, row in tsv.iterrows():\n", + " sid = row['SentenceId']\n", + " codeStr = row['Codes']\n", + " paragraph = row['Paragraph']\n", + " text = row['Sentence Text']\n", + " heading = row['Headings']\n", + " floatingBox = row['FloatingBox?']\n", + " discourse = row['Discourse Type']\n", + " reachData = row['friesEventsTypes']\n", + "\n", + " j += 1\n", + " if (reachData == reachData):\n", + " allHits += 1\n", + "\n", + " if (heading != heading):\n", + " heading = \"\"\n", + "\n", + " if (floatingBox):\n", + " 
continue\n", + "\n", + " if (('implication' not in discourse) and\n", + " 'result' not in discourse):\n", + " continue\n", + "\n", + " if 'exLink' in codeStr:\n", + " continue\n", + "\n", + " if ('methods' in str(heading).lower()):\n", + " continue\n", + "\n", + " r = 'X'\n", + " if (reachData != reachData):\n", + " r = '0'\n", + "\n", + " if (reachData == reachData):\n", + " hits += 1\n", + "\n", + " # print(sid + ' (' + heading + ',' + discourse + ') ' + '[' + r + '] : ' + text )\n", + "\n", + " text = re.sub(regex1, \"\", text)\n", + " sent = regex2.split(text)\n", + " sent = [w for w in sent if w not in sw and len(w)>0]\n", + " tup = (fid, sid, sent)\n", + " sentences.append(tup)\n", + "\n", + " return sentences" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "from ipywidgets import FloatProgress\n", + "from IPython.display import display\n", + "\n", + "inDir = '/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/scidt_fries_bioc_tsv4/'\n", + "sent_tup_list = []\n", + "\n", + "f = FloatProgress(min=0, max=100)\n", + "display(f)\n", + "\n", + "sent_list = []\n", + "for fn in os.listdir(inDir):\n", + " infile = inDir + \"/\" + fn\n", + " if (os.path.isfile(infile) and fn.endswith('.tsv')):\n", + " fid = fn.replace(\".tsv\", \"\")\n", + " f.value += 1\n", + " for tup in retrieve_sentences_for_modeling(infile, fid):\n", + " sent_list.append(tup[2]);" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from gensim import corpora\n", + "\n", + "dictionary = corpora.Dictionary(sent_list)\n", + "#dictionary.save('/tmp/deerwester.dict')\n" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "corpus = [dictionary.doc2bow(sent) for sent in 
sent_list]" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "dictionary.save(inDir + '/sent.dict')\n", + "corpora.MmCorpus.serialize(inDir + '/sent.mm', corpus)" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "mm = corpora.MmCorpus(inDir + '/sent.mm')" + ] + }, + { + "cell_type": "code", + "execution_count": 50, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "MmCorpus(88353 documents, 72629 features, 1287192 non-zero entries)\n" + ] + } + ], + "source": [ + "print(mm)" + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from gensim.models.ldamodel import LdaModel\n", + "lda = LdaModel(corpus=mm, id2word=dictionary, num_topics=100, update_every=1, chunksize=10000, passes=5)" + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[(0,\n", + " u'0.062*\"family\" + 0.045*\"defects\" + 0.043*\"members\" + 0.027*\"act\" + 0.022*\"proteins\" + 0.017*\"neurite\" + 0.016*\"subset\" + 0.014*\"chaperone\" + 0.013*\"percentage\" + 0.013*\"pronounced\"'),\n", + " (1,\n", + " u'0.058*\"even\" + 0.042*\"detect\" + 0.039*\"possibility\" + 0.027*\"partners\" + 0.022*\"abundance\" + 0.022*\"proteins\" + 0.020*\"interaction\" + 0.020*\"though\" + 0.019*\"We\" + 0.019*\"physical\"'),\n", + " (2,\n", + " u'0.111*\"residues\" + 0.076*\"Figures\" + 0.054*\"site\" + 0.052*\"conserved\" + 0.042*\"highly\" + 0.034*\"binding\" + 0.022*\"The\" + 0.018*\"critical\" + 0.017*\"region\" + 0.016*\"peptide\"'),\n", + " (3,\n", + " u'0.058*\"subunits\" + 0.047*\"subunit\" + 0.036*\"catalytic\" + 0.033*\"vesicles\" + 0.024*\"marker\" + 0.023*\"infected\" + 
0.018*\"myostatin\" + 0.017*\"added\" + 0.017*\"module\" + 0.016*\"latent\"'),\n", + " (4,\n", + " u'0.110*\"Figure\" + 0.081*\"S1\" + 0.056*\"1A\" + 0.024*\"weak\" + 0.021*\"2E\" + 0.019*\"embryos\" + 0.018*\"potentially\" + 0.018*\"alanine\" + 0.018*\"dephosphorylation\" + 0.015*\"upregulated\"'),\n", + " (5,\n", + " u'0.078*\"cells\" + 0.059*\"endogenous\" + 0.050*\"staining\" + 0.031*\"nucleus\" + 0.030*\"pattern\" + 0.026*\"cytoplasm\" + 0.026*\"localized\" + 0.023*\"Fig\" + 0.021*\"ubiquitination\" + 0.016*\"HeLa\"'),\n", + " (6,\n", + " u'0.057*\"indicates\" + 0.048*\"Additional\" + 0.047*\"treated\" + 0.040*\"Consistent\" + 0.037*\"file\" + 0.036*\"cells\" + 0.030*\"measured\" + 0.025*\"Figure\" + 0.022*\"1\" + 0.019*\"ERK\"'),\n", + " (7,\n", + " u'0.102*\"used\" + 0.052*\"major\" + 0.031*\"associate\" + 0.024*\"proteins\" + 0.020*\"protein\" + 0.019*\"One\" + 0.019*\"IgG\" + 0.017*\"stably\" + 0.016*\"recognized\" + 0.015*\"cancers\"'),\n", + " (8,\n", + " u'0.147*\"role\" + 0.042*\"differences\" + 0.041*\"important\" + 0.038*\"substrate\" + 0.026*\"extent\" + 0.024*\"plays\" + 0.022*\"central\" + 0.022*\"--\" + 0.020*\"The\" + 0.020*\"play\"'),\n", + " (9,\n", + " u'0.076*\"domains\" + 0.051*\"motif\" + 0.040*\"domain\" + 0.040*\"binding\" + 0.023*\"specificity\" + 0.023*\"SH3\" + 0.020*\"proteins\" + 0.019*\"putative\" + 0.017*\"contain\" + 0.015*\"PKA\"'),\n", + " (10,\n", + " u'0.092*\"mutant\" + 0.047*\"single\" + 0.041*\"actin\" + 0.034*\"mutants\" + 0.022*\"Further\" + 0.022*\"double\" + 0.019*\"defect\" + 0.018*\"isoform\" + 0.016*\"towards\" + 0.015*\"cytoskeleton\"'),\n", + " (11,\n", + " u'0.052*\"005\" + 0.026*\"Tau\" + 0.022*\"abrogated\" + 0.022*\"Dab2\" + 0.022*\"Rab7\" + 0.021*\"LRP6\" + 0.019*\"Wnt\" + 0.018*\"signaling\" + 0.017*\"c-Met\" + 0.017*\"Lyn\"'),\n", + " (12,\n", + " u'0.089*\"panel\" + 0.058*\"Fig\" + 0.039*\"right\" + 0.034*\"left\" + 0.029*\"panels\" + 0.026*\"experiment\" + 0.023*\"top\" + 0.023*\"Figs\" + 
0.020*\"retained\" + 0.020*\"bottom\"'),\n", + " (13,\n", + " u'0.077*\"WT\" + 0.045*\"cells\" + 0.044*\"obtained\" + 0.044*\"lower\" + 0.031*\"data\" + 0.031*\"results\" + 0.027*\"Similar\" + 0.024*\"much\" + 0.023*\"wt\" + 0.020*\"MEFs\"'),\n", + " (14,\n", + " u'0.150*\"interactions\" + 0.040*\"proteins\" + 0.029*\"structural\" + 0.024*\"motifs\" + 0.023*\"peptides\" + 0.022*\"interface\" + 0.022*\"interaction\" + 0.020*\"rather\" + 0.019*\"common\" + 0.018*\"The\"'),\n", + " (15,\n", + " u'0.064*\"size\" + 0.050*\"case\" + 0.029*\"agreement\" + 0.028*\"sensitive\" + 0.028*\"regulatory\" + 0.027*\"In\" + 0.025*\"muscle\" + 0.025*\"reduces\" + 0.022*\"clathrin\" + 0.022*\"3E\"'),\n", + " (16,\n", + " u'0.144*\"levels\" + 0.077*\"expression\" + 0.042*\"mRNA\" + 0.037*\"Figure\" + 0.036*\"low\" + 0.031*\"high\" + 0.028*\"normal\" + 0.028*\"protein\" + 0.019*\"level\" + 0.017*\"3C\"'),\n", + " (17,\n", + " u'0.090*\"type\" + 0.071*\"inhibitor\" + 0.052*\"exhibited\" + 0.044*\"wild\" + 0.028*\"mutant\" + 0.025*\"analyzed\" + 0.023*\"III\" + 0.022*\"efficiency\" + 0.020*\"lead\" + 0.019*\"defective\"'),\n", + " (18,\n", + " u'0.052*\"using\" + 0.051*\"antibody\" + 0.038*\"confirmed\" + 0.035*\"detected\" + 0.035*\"Figure\" + 0.027*\"analysis\" + 0.025*\"Fig\" + 0.021*\"lysates\" + 0.020*\"shown\" + 0.020*\"extracts\"'),\n", + " (19,\n", + " u'0.060*\"manner\" + 0.043*\"patients\" + 0.032*\"amounts\" + 0.030*\"processing\" + 0.026*\"increasing\" + 0.019*\"trafficking\" + 0.018*\"disruption\" + 0.017*\"dose-dependent\" + 0.017*\"unaffected\" + 0.017*\"exposure\"'),\n", + " (20,\n", + " u'0.070*\"positive\" + 0.069*\"p\" + 0.048*\"ER\" + 0.035*\"contained\" + 0.034*\"altered\" + 0.033*\"ratio\" + 0.023*\"7C\" + 0.020*\"effective\" + 0.019*\"pairs\" + 0.015*\"inhibiting\"'),\n", + " (21,\n", + " u'0.164*\"domain\" + 0.080*\"additional\" + 0.064*\"region\" + 0.063*\"data\" + 0.056*\"file\" + 0.054*\"Click\" + 0.039*\"C-terminal\" + 0.034*\"N-terminal\" + 0.017*\"part\" + 
0.017*\"binding\"'),\n", + " (22,\n", + " u'0.053*\"following\" + 0.035*\"infection\" + 0.035*\"G\" + 0.034*\"cells\" + 0.028*\"virus\" + 0.028*\"remained\" + 0.018*\"H\" + 0.018*\"Fig\" + 0.016*\"particles\" + 0.015*\"incubated\"'),\n", + " (23,\n", + " u'0.107*\"increased\" + 0.082*\"significantly\" + 0.063*\"cells\" + 0.046*\"compared\" + 0.043*\"Figure\" + 0.035*\"decreased\" + 0.032*\"higher\" + 0.031*\"reduced\" + 0.017*\"concentration\" + 0.017*\"amount\"'),\n", + " (24,\n", + " u'0.105*\"Our\" + 0.046*\"cell\" + 0.046*\"data\" + 0.029*\"Together\" + 0.028*\"development\" + 0.026*\"suggest\" + 0.026*\"increases\" + 0.025*\"cycle\" + 0.022*\"mechanisms\" + 0.021*\"results\"'),\n", + " (25,\n", + " u'0.077*\"localization\" + 0.037*\"distribution\" + 0.025*\"kinases\" + 0.024*\"enrichment\" + 0.021*\"subcellular\" + 0.019*\"possibly\" + 0.017*\"summary\" + 0.016*\"In\" + 0.014*\"investigated\" + 0.014*\"consequence\"'),\n", + " (26,\n", + " u'0.102*\"cells\" + 0.087*\"T\" + 0.031*\"Fig\" + 0.026*\"concentrations\" + 0.021*\"relatively\" + 0.019*\"hours\" + 0.018*\"CD4+\" + 0.017*\"cell\" + 0.013*\"days\" + 0.013*\"transient\"'),\n", + " (27,\n", + " u'0.089*\"replication\" + 0.035*\"density\" + 0.034*\"dimer\" + 0.033*\"complete\" + 0.028*\"Figure\" + 0.026*\"enzyme\" + 0.022*\"partial\" + 0.021*\"Hsp27\" + 0.018*\"equivalent\" + 0.017*\"4E\"'),\n", + " (28,\n", + " u'0.063*\">\" + 0.045*\"N\" + 0.032*\"beta\" + 0.027*\"per\" + 0.025*\"probably\" + 0.025*\"Notably\" + 0.023*\"causes\" + 0.023*\"Of\" + 0.022*\"alpha\" + 0.021*\"animals\"'),\n", + " (29,\n", + " u'0.072*\"activation\" + 0.039*\"promoter\" + 0.037*\"activity\" + 0.032*\"stimulation\" + 0.030*\"cells\" + 0.027*\"Fig\" + 0.026*\"expression\" + 0.025*\"induced\" + 0.024*\"inhibited\" + 0.022*\"reporter\"'),\n", + " (30,\n", + " u'0.039*\"c\" + 0.032*\"Fig\" + 0.030*\"cells\" + 0.030*\"E-cadherin\" + 0.025*\"d\" + 0.020*\"lung\" + 0.019*\"vinculin\" + 0.019*\"mM\" + 0.017*\"cytosol\" + 
0.017*\"mature\"'),\n", + " (31,\n", + " u'0.055*\"cells\" + 0.047*\"knockdown\" + 0.042*\"resulted\" + 0.038*\"reduction\" + 0.037*\"siRNA\" + 0.035*\"decrease\" + 0.034*\"Fig\" + 0.027*\"caused\" + 0.023*\"impaired\" + 0.020*\"expression\"'),\n", + " (32,\n", + " u'0.042*\"residue\" + 0.032*\"chain\" + 0.031*\"position\" + 0.027*\"identical\" + 0.025*\"appear\" + 0.021*\"side\" + 0.018*\"mutated\" + 0.018*\"In\" + 0.018*\"PD\" + 0.017*\"selective\"'),\n", + " (33,\n", + " u'0.111*\"<\" + 0.090*\"P\" + 0.049*\"neurons\" + 0.037*\"comparison\" + 0.021*\"001\" + 0.021*\"bud\" + 0.018*\"Analysis\" + 0.018*\"0001\" + 0.017*\"48\" + 0.017*\"significant\"'),\n", + " (34,\n", + " u'0.129*\"Figure\" + 0.052*\"4C\" + 0.038*\"5C\" + 0.035*\"4D\" + 0.027*\"MAVS\" + 0.021*\"S2\" + 0.021*\"NSP1\" + 0.016*\"rapid\" + 0.015*\"x\" + 0.014*\"c-Myc\"'),\n", + " (35,\n", + " u'0.130*\"results\" + 0.129*\"These\" + 0.065*\"suggest\" + 0.050*\"indicate\" + 0.044*\"vivo\" + 0.040*\"data\" + 0.031*\"findings\" + 0.020*\"observations\" + 0.018*\"interaction\" + 0.018*\"may\"'),\n", + " (36,\n", + " u'0.070*\"cells\" + 0.066*\"surface\" + 0.051*\"2A\" + 0.045*\"expression\" + 0.043*\"cell\" + 0.040*\"induction\" + 0.026*\"induces\" + 0.026*\"leads\" + 0.022*\"Expression\" + 0.021*\"epithelial\"'),\n", + " (37,\n", + " u'0.100*\"Table\" + 0.099*\"1\" + 0.058*\"Fig\" + 0.032*\"2\" + 0.032*\"S2\" + 0.021*\"supplementary\" + 0.019*\"sensitivity\" + 0.019*\"Information\" + 0.017*\"proteins\" + 0.016*\"online\"'),\n", + " (38,\n", + " u'0.046*\"early\" + 0.042*\"strain\" + 0.030*\"APP\" + 0.029*\"late\" + 0.027*\"endosomes\" + 0.024*\"stage\" + 0.021*\"Data\" + 0.019*\"lysosomal\" + 0.016*\"lysosomes\" + 0.015*\"CMA\"'),\n", + " (39,\n", + " u'0.140*\"effect\" + 0.044*\"without\" + 0.037*\"controls\" + 0.030*\"inhibitory\" + 0.026*\"showing\" + 0.025*\"little\" + 0.022*\"supported\" + 0.022*\"HIV-1\" + 0.020*\"GTP\" + 0.017*\"patterns\"'),\n", + " (40,\n", + " u'0.040*\"signalling\" + 
0.034*\"support\" + 0.032*\"synthesis\" + 0.031*\"combination\" + 0.027*\"experimental\" + 0.024*\"physiological\" + 0.022*\"conclusion\" + 0.020*\"test\" + 0.020*\"functionally\" + 0.017*\"To\"'),\n", + " (41,\n", + " u'0.066*\"whether\" + 0.043*\"determined\" + 0.042*\"examined\" + 0.037*\"survival\" + 0.033*\"lack\" + 0.031*\"produced\" + 0.024*\"secretion\" + 0.022*\"around\" + 0.021*\"We\" + 0.021*\"resistance\"'),\n", + " (42,\n", + " u'0.070*\"It\" + 0.047*\"possible\" + 0.038*\"For\" + 0.029*\"blocked\" + 0.025*\"example\" + 0.023*\"yet\" + 0.021*\"foci\" + 0.020*\"approach\" + 0.019*\"AKT\" + 0.018*\"action\"'),\n", + " (43,\n", + " u'0.039*\"12\" + 0.032*\"11\" + 0.030*\"represent\" + 0.020*\"zinc\" + 0.019*\"standard\" + 0.018*\"bars\" + 0.018*\"value\" + 0.017*\"colocalized\" + 0.017*\"genomic\" + 0.016*\"pH\"'),\n", + " (44,\n", + " u'0.214*\"complex\" + 0.091*\"formation\" + 0.019*\"form\" + 0.018*\"component\" + 0.013*\"context\" + 0.013*\"complexes\" + 0.012*\"provided\" + 0.011*\"We\" + 0.010*\"involvement\" + 0.009*\"DNMT1\"'),\n", + " (45,\n", + " u'0.063*\"inhibition\" + 0.049*\"activity\" + 0.047*\"enhanced\" + 0.044*\"due\" + 0.039*\"migration\" + 0.037*\"mediated\" + 0.037*\"dependent\" + 0.036*\"via\" + 0.029*\"Importantly\" + 0.027*\"transcriptional\"'),\n", + " (46,\n", + " u'0.126*\"mice\" + 0.077*\"conditions\" + 0.036*\"proliferation\" + 0.031*\"cells\" + 0.025*\"vector\" + 0.021*\"find\" + 0.015*\"cultured\" + 0.014*\"expression\" + 0.013*\"control\" + 0.012*\"transgenic\"'),\n", + " (47,\n", + " u'0.043*\"membranes\" + 0.035*\"dissociation\" + 0.031*\"glucose\" + 0.023*\"22\" + 0.021*\"constitutive\" + 0.021*\"mm\" + 0.021*\"labeling\" + 0.020*\"distal\" + 0.017*\"80\" + 0.016*\"chromosomes\"'),\n", + " (48,\n", + " u'0.076*\"kinase\" + 0.075*\"signal\" + 0.046*\"pathways\" + 0.033*\"The\" + 0.030*\"length\" + 0.021*\"genetic\" + 0.019*\"extracellular\" + 0.019*\"capable\" + 0.018*\"full\" + 0.017*\"activity\"'),\n", + " (49,\n", + " 
u'0.081*\"phenotype\" + 0.044*\"samples\" + 0.043*\"2D\" + 0.024*\"rescue\" + 0.022*\"nuclei\" + 0.022*\"noted\" + 0.013*\"significant\" + 0.013*\"statistically\" + 0.013*\"The\" + 0.013*\"maintenance\"'),\n", + " (50,\n", + " u'0.080*\"molecular\" + 0.041*\"protein\" + 0.036*\"mass\" + 0.031*\"SIRT1\" + 0.024*\"impact\" + 0.023*\"weight\" + 0.018*\"predicted\" + 0.017*\"extensive\" + 0.015*\"The\" + 0.015*\"1E\"'),\n", + " (51,\n", + " u'0.034*\"translation\" + 0.030*\"clusters\" + 0.026*\"MuSK\" + 0.025*\"cluster\" + 0.024*\"DDX3\" + 0.022*\"CENP-E\" + 0.020*\"Rab32\" + 0.019*\"hBUBR1\" + 0.017*\"PABP\" + 0.016*\"Rab8\"'),\n", + " (52,\n", + " u'0.087*\"Fig\" + 0.059*\"C\" + 0.053*\"D\" + 0.037*\"E\" + 0.028*\"network\" + 0.027*\"F\" + 0.026*\"stress\" + 0.019*\"B\" + 0.015*\"1\" + 0.014*\"F-actin\"'),\n", + " (53,\n", + " u'0.043*\"least\" + 0.036*\"molecules\" + 0.033*\"two\" + 0.032*\"group\" + 0.030*\"interacted\" + 0.029*\"one\" + 0.027*\"proteins\" + 0.022*\"tail\" + 0.021*\"peak\" + 0.017*\"groups\"'),\n", + " (54,\n", + " u'0.055*\"vitro\" + 0.054*\"phosphorylated\" + 0.048*\"assays\" + 0.043*\"assay\" + 0.031*\"purified\" + 0.030*\"binding\" + 0.025*\"affinity\" + 0.022*\"bound\" + 0.018*\"Fig\" + 0.016*\"interaction\"'),\n", + " (55,\n", + " u'0.086*\"transfected\" + 0.074*\"cells\" + 0.037*\"constructs\" + 0.035*\"stable\" + 0.018*\"transfection\" + 0.017*\"expressing\" + 0.016*\"HEK293\" + 0.016*\"capacity\" + 0.015*\"VEGF\" + 0.014*\"plasmids\"'),\n", + " (56,\n", + " u'0.078*\"degradation\" + 0.062*\"RNA\" + 0.056*\"3B\" + 0.054*\"therefore\" + 0.032*\"appears\" + 0.029*\"along\" + 0.028*\"targets\" + 0.028*\"On\" + 0.023*\"hand\" + 0.022*\"promotes\"'),\n", + " (57,\n", + " u'0.087*\"study\" + 0.043*\"In\" + 0.041*\"potential\" + 0.041*\"novel\" + 0.026*\"present\" + 0.024*\"new\" + 0.023*\"presented\" + 0.022*\"Here\" + 0.022*\"protein\" + 0.021*\"Myo5p\"'),\n", + " (58,\n", + " u'0.173*\"phosphorylation\" + 0.050*\"sites\" + 0.029*\"kinase\" + 
0.028*\"tyrosine\" + 0.028*\"14-3-3\" + 0.024*\"binding\" + 0.024*\"receptors\" + 0.021*\"isoforms\" + 0.019*\"inhibitors\" + 0.013*\"many\"'),\n", + " (59,\n", + " u'0.044*\"ATP\" + 0.035*\"suppression\" + 0.031*\"neither\" + 0.028*\"Supplemental\" + 0.027*\"transmembrane\" + 0.021*\"particularly\" + 0.021*\"gel\" + 0.021*\"allow\" + 0.019*\"frequency\" + 0.018*\"puncta\"'),\n", + " (60,\n", + " u'0.265*\":\" + 0.026*\"available\" + 0.024*\"disease\" + 0.015*\"genome\" + 0.015*\"detection\" + 0.014*\"Arabidopsis\" + 0.011*\"information\" + 0.010*\"closely\" + 0.009*\"human\" + 0.009*\"PPI\"'),\n", + " (61,\n", + " u'0.049*\"sequence\" + 0.035*\"targeting\" + 0.031*\"sequences\" + 0.029*\"b\" + 0.025*\"mitochondrial\" + 0.024*\"related\" + 0.023*\"species\" + 0.020*\"protein\" + 0.019*\"S\" + 0.018*\"degree\"'),\n", + " (62,\n", + " u'0.053*\"total\" + 0.046*\"kDa\" + 0.033*\"Both\" + 0.029*\"Gin4\" + 0.026*\"larger\" + 0.022*\"intensity\" + 0.022*\"rRNA\" + 0.020*\"values\" + 0.018*\"NOD2\" + 0.016*\"Drosophila\"'),\n", + " (63,\n", + " u'0.084*\"DNA\" + 0.072*\"transcription\" + 0.046*\"factors\" + 0.030*\"chromatin\" + 0.025*\"acetylation\" + 0.025*\"binding\" + 0.021*\"promoter\" + 0.020*\"set\" + 0.018*\"activity\" + 0.017*\"factor\"'),\n", + " (64,\n", + " u'0.064*\"I\" + 0.043*\"2C\" + 0.038*\"II\" + 0.035*\"p53\" + 0.033*\"FAK\" + 0.026*\"Pol\" + 0.025*\"RNAi\" + 0.024*\"Akt\" + 0.021*\"REG\" + 0.017*\"E2F1\"'),\n", + " (65,\n", + " u'0.059*\"alone\" + 0.043*\"fusion\" + 0.034*\"construct\" + 0.030*\"Fig\" + 0.030*\"recombinant\" + 0.027*\"protein\" + 0.021*\"soluble\" + 0.020*\"GST\" + 0.019*\"Src\" + 0.018*\"G2019S\"'),\n", + " (66,\n", + " u'0.060*\"changes\" + 0.050*\"change\" + 0.029*\"death\" + 0.024*\"enriched\" + 0.020*\"characterized\" + 0.018*\"conformational\" + 0.018*\"binding\" + 0.016*\"The\" + 0.016*\"neuronal\" + 0.016*\"adjacent\"'),\n", + " (67,\n", + " u'0.061*\"EGFR\" + 0.035*\"endocytosis\" + 0.030*\"There\" + 0.026*\"molecule\" + 
0.023*\"occur\" + 0.020*\"eg\" + 0.020*\"chains\" + 0.019*\"modification\" + 0.019*\"EGF\" + 0.018*\"contacts\"'),\n", + " (68,\n", + " u'0.047*\"fraction\" + 0.044*\"Fig\" + 0.042*\"antibodies\" + 0.038*\"cells\" + 0.038*\"5B\" + 0.037*\"fractions\" + 0.028*\"performed\" + 0.022*\"Western\" + 0.021*\"blotting\" + 0.017*\"immunoprecipitated\"'),\n", + " (69,\n", + " u'0.038*\"By\" + 0.033*\"OPTN\" + 0.033*\"smaller\" + 0.029*\"mitosis\" + 0.029*\"overexpressing\" + 0.022*\"contrast\" + 0.020*\"instead\" + 0.018*\"portion\" + 0.016*\"ATRX\" + 0.016*\"resistant\"'),\n", + " (70,\n", + " u'0.158*\"%\" + 0.132*\"-\" + 0.058*\"=\" + 0.045*\"+\" + 0.033*\"10\" + 0.027*\"min\" + 0.023*\"respectively\" + 0.019*\"viral\" + 0.016*\"n\" + 0.015*\"50\"'),\n", + " (71,\n", + " u'0.089*\"active\" + 0.066*\"3A\" + 0.037*\"close\" + 0.029*\"proximity\" + 0.026*\"combined\" + 0.024*\"established\" + 0.020*\"linked\" + 0.020*\"readily\" + 0.015*\"form\" + 0.014*\"viruses\"'),\n", + " (72,\n", + " u'0.177*\"LRRK2\" + 0.035*\"tumors\" + 0.026*\"aggregates\" + 0.021*\"clones\" + 0.020*\"microscopy\" + 0.018*\"CD\" + 0.018*\"Based\" + 0.014*\"box\" + 0.014*\"ArfGAP1\" + 0.012*\"We\"'),\n", + " (73,\n", + " u'0.047*\"upstream\" + 0.039*\"tissue\" + 0.034*\"demonstrating\" + 0.031*\"correlated\" + 0.027*\"apical\" + 0.022*\"associates\" + 0.019*\"TLP\" + 0.018*\"TFIIA\" + 0.015*\"peripheral\" + 0.015*\"networks\"'),\n", + " (74,\n", + " u'0.055*\"yeast\" + 0.039*\"previously\" + 0.035*\"interaction\" + 0.035*\"system\" + 0.032*\"clearly\" + 0.030*\"reported\" + 0.027*\"demonstrate\" + 0.019*\"described\" + 0.019*\"binding\" + 0.018*\"proteins\"'),\n", + " (75,\n", + " u'0.114*\"Fig\" + 0.065*\"3\" + 0.057*\"4\" + 0.040*\"5\" + 0.040*\"B\" + 0.039*\"A\" + 0.037*\"nuclear\" + 0.037*\"6\" + 0.036*\"2\" + 0.030*\"Figure\"'),\n", + " (76,\n", + " u'0.107*\"Figure\" + 0.055*\"5A\" + 0.039*\"efficiently\" + 0.026*\"GC\" + 0.023*\"plants\" + 0.016*\"turn\" + 0.015*\"preferentially\" + 
0.014*\"COP\" + 0.014*\"S7\" + 0.013*\"TMZ\"'),\n", + " (77,\n", + " u'0.024*\"transport\" + 0.023*\"like\" + 0.022*\"screen\" + 0.020*\"recovered\" + 0.020*\"minor\" + 0.018*\"INCENP\" + 0.017*\"Roquin\" + 0.016*\"First\" + 0.015*\"arrows\" + 0.013*\"maturation\"'),\n", + " (78,\n", + " u'0.069*\"growth\" + 0.066*\"cells\" + 0.058*\"time\" + 0.053*\"h\" + 0.043*\"cell\" + 0.037*\"treatment\" + 0.035*\"apoptosis\" + 0.025*\"24\" + 0.022*\"rate\" + 0.019*\"At\"'),\n", + " (79,\n", + " u'0.083*\"amino\" + 0.051*\"target\" + 0.051*\"acid\" + 0.045*\"acids\" + 0.030*\"individual\" + 0.027*\"promote\" + 0.027*\"1C\" + 0.022*\"immune\" + 0.021*\"ubiquitin\" + 0.013*\"over-expression\"'),\n", + " (80,\n", + " u'0.096*\"three\" + 0.053*\"independent\" + 0.039*\"four\" + 0.033*\"proteins\" + 0.028*\"All\" + 0.026*\"almost\" + 0.023*\"block\" + 0.019*\"experiments\" + 0.018*\"general\" + 0.017*\"inactive\"'),\n", + " (81,\n", + " u'0.062*\"4B\" + 0.048*\"Figure\" + 0.042*\"interacting\" + 0.036*\"appeared\" + 0.031*\"START\" + 0.024*\"Hsp90\" + 0.020*\"proteins\" + 0.018*\"The\" + 0.017*\"light\" + 0.016*\"5E\"'),\n", + " (82,\n", + " u'0.087*\"mutation\" + 0.048*\"completely\" + 0.043*\"Figure\" + 0.034*\"S5\" + 0.034*\"abolished\" + 0.031*\"inhibits\" + 0.031*\"binding\" + 0.025*\"mutations\" + 0.021*\"interaction\" + 0.021*\"integrin\"'),\n", + " (83,\n", + " u'0.063*\"relative\" + 0.057*\"basal\" + 0.037*\"cleavage\" + 0.031*\"nM\" + 0.026*\"mean\" + 0.026*\"phase\" + 0.023*\"cause\" + 0.018*\"sample\" + 0.017*\"M\" + 0.016*\"G1\"'),\n", + " (84,\n", + " u'0.058*\"4A\" + 0.055*\"production\" + 0.054*\"Figure\" + 0.048*\"S3\" + 0.047*\"NS5A\" + 0.027*\"colocalization\" + 0.023*\"observe\" + 0.023*\"cells\" + 0.021*\"despite\" + 0.021*\"PI4KIIIalpha\"'),\n", + " (85,\n", + " u'0.093*\"Figure\" + 0.040*\"6A\" + 0.034*\"HCV\" + 0.033*\"6B\" + 0.026*\"tau\" + 0.026*\"6C\" + 0.026*\"generated\" + 0.026*\"co-localization\" + 0.020*\"confirm\" + 0.019*\"negative\"'),\n", + " 
(86,\n", + " u'0.041*\"suggests\" + 0.039*\"may\" + 0.033*\"This\" + 0.033*\"cellular\" + 0.031*\"regulation\" + 0.029*\"mechanism\" + 0.027*\"proteins\" + 0.027*\"functions\" + 0.026*\"evidence\" + 0.023*\"involved\"'),\n", + " (87,\n", + " u'0.140*\"gene\" + 0.057*\"mouse\" + 0.054*\"expression\" + 0.036*\"silencing\" + 0.030*\"substrates\" + 0.022*\"LAX\" + 0.022*\"upper\" + 0.021*\"CTLA-4\" + 0.018*\"fibroblasts\" + 0.016*\"open\"'),\n", + " (88,\n", + " u'0.078*\"tumor\" + 0.045*\"factor\" + 0.034*\"mitotic\" + 0.029*\"red\" + 0.028*\"plasmid\" + 0.025*\"growth\" + 0.024*\"cells\" + 0.021*\"effector\" + 0.021*\"green\" + 0.021*\"initiation\"'),\n", + " (89,\n", + " u'0.064*\"deletion\" + 0.043*\"lacking\" + 0.033*\"fragment\" + 0.032*\"Finally\" + 0.030*\"ie\" + 0.021*\"allele\" + 0.021*\"step\" + 0.021*\"subsequent\" + 0.018*\"mutant\" + 0.017*\"lost\"'),\n", + " (90,\n", + " u'0.106*\"structure\" + 0.035*\"among\" + 0.029*\"conclude\" + 0.029*\"located\" + 0.021*\"The\" + 0.020*\"native\" + 0.018*\"chromosome\" + 0.017*\"A\" + 0.016*\"dystrophin\" + 0.012*\"We\"'),\n", + " (91,\n", + " u'0.070*\"pathway\" + 0.067*\"signaling\" + 0.055*\"together\" + 0.031*\"Taken\" + 0.031*\"beta-catenin\" + 0.029*\"activation\" + 0.024*\"downstream\" + 0.019*\"data\" + 0.018*\"regulates\" + 0.016*\"state\"'),\n", + " (92,\n", + " u'0.078*\"et\" + 0.078*\"al\" + 0.037*\"PTEN\" + 0.023*\"targeted\" + 0.021*\"motility\" + 0.014*\"Pan3\" + 0.014*\"&\" + 0.013*\"ERK12\" + 0.011*\"Pan2\" + 0.011*\"also\"'),\n", + " (93,\n", + " u'0.131*\"membrane\" + 0.098*\"Figure\" + 0.053*\"2B\" + 0.040*\"plasma\" + 0.032*\"S4\" + 0.029*\"3D\" + 0.019*\"majority\" + 0.019*\"mainly\" + 0.017*\"culture\" + 0.016*\"Golgi\"'),\n", + " (94,\n", + " u'0.035*\"fully\" + 0.030*\"calreticulin\" + 0.025*\"robust\" + 0.023*\"repair\" + 0.018*\"presumably\" + 0.015*\"c-Fos\" + 0.014*\"TLR9\" + 0.013*\"cisplatin\" + 0.013*\"deficient\" + 0.012*\"hMSH5\"'),\n", + " (95,\n", + " u'0.042*\"human\" + 
0.034*\"slightly\" + 0.030*\"background\" + 0.028*\"PDI\" + 0.028*\"confirming\" + 0.028*\"5D\" + 0.027*\"depleted\" + 0.027*\"autophosphorylation\" + 0.025*\"determine\" + 0.021*\"rat\"'),\n", + " (96,\n", + " u'0.049*\"cells\" + 0.043*\"stability\" + 0.028*\"followed\" + 0.026*\"spindle\" + 0.026*\"invasion\" + 0.019*\"variant\" + 0.018*\"DACT1\" + 0.017*\"microtubules\" + 0.017*\"variants\" + 0.015*\"often\"'),\n", + " (97,\n", + " u'0.108*\"cell\" + 0.071*\"cancer\" + 0.056*\"lines\" + 0.042*\"cells\" + 0.040*\"Discussion\" + 0.034*\"breast\" + 0.029*\"line\" + 0.025*\"difference\" + 0.022*\"expression\" + 0.020*\"No\"'),\n", + " (98,\n", + " u'0.366*\"Supplementary\" + 0.038*\"Material\" + 0.038*\"Fig\" + 0.033*\"primary\" + 0.023*\"lipid\" + 0.022*\"Rb\" + 0.017*\"HAUSP\" + 0.016*\"secondary\" + 0.013*\"1b\" + 0.013*\"deficiency\"'),\n", + " (99,\n", + " u'0.154*\"Figure\" + 0.042*\"1B\" + 0.040*\"depletion\" + 0.032*\"GFP\" + 0.027*\"fluorescence\" + 0.023*\"In\" + 0.023*\"7A\" + 0.022*\"1D\" + 0.020*\"7B\" + 0.015*\"mRNAs\"')]" + ] + }, + "execution_count": 56, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "lda.print_topics(100)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.13" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/CoSID Gantts .ipynb b/notebooks/CoSID Gantts .ipynb new file mode 100644 index 0000000..a441b98 --- /dev/null +++ b/notebooks/CoSID Gantts .ipynb @@ -0,0 +1,677 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 
165, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "
\n", + " \n", + " Loading BokehJS ...\n", + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "application/javascript": [ + "\n", + "(function(global) {\n", + " function now() {\n", + " return new Date();\n", + " }\n", + "\n", + " if (typeof (window._bokeh_onload_callbacks) === \"undefined\") {\n", + " window._bokeh_onload_callbacks = [];\n", + " }\n", + "\n", + " function run_callbacks() {\n", + " window._bokeh_onload_callbacks.forEach(function(callback) { callback() });\n", + " delete window._bokeh_onload_callbacks\n", + " console.info(\"Bokeh: all callbacks have finished\");\n", + " }\n", + "\n", + " function load_libs(js_urls, callback) {\n", + " window._bokeh_onload_callbacks.push(callback);\n", + " if (window._bokeh_is_loading > 0) {\n", + " console.log(\"Bokeh: BokehJS is being loaded, scheduling callback at\", now());\n", + " return null;\n", + " }\n", + " if (js_urls == null || js_urls.length === 0) {\n", + " run_callbacks();\n", + " return null;\n", + " }\n", + " console.log(\"Bokeh: BokehJS not loaded, scheduling load and callback at\", now());\n", + " window._bokeh_is_loading = js_urls.length;\n", + " for (var i = 0; i < js_urls.length; i++) {\n", + " var url = js_urls[i];\n", + " var s = document.createElement('script');\n", + " s.src = url;\n", + " s.async = false;\n", + " s.onreadystatechange = s.onload = function() {\n", + " window._bokeh_is_loading--;\n", + " if (window._bokeh_is_loading === 0) {\n", + " console.log(\"Bokeh: all BokehJS libraries loaded\");\n", + " run_callbacks()\n", + " }\n", + " };\n", + " s.onerror = function() {\n", + " console.warn(\"failed to load library \" + url);\n", + " };\n", + " console.log(\"Bokeh: injecting script tag for BokehJS library: \", url);\n", + " document.getElementsByTagName(\"head\")[0].appendChild(s);\n", + " }\n", + " };\n", + "\n", + " var js_urls = ['https://cdn.pydata.org/bokeh/release/bokeh-0.11.1.min.js', 'https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.11.1.min.js', 
'https://cdn.pydata.org/bokeh/release/bokeh-compiler-0.11.1.min.js'];\n", + "\n", + " var inline_js = [\n", + " function(Bokeh) {\n", + " Bokeh.set_log_level(\"info\");\n", + " },\n", + " \n", + " function(Bokeh) {\n", + " Bokeh.$(\"#a0c72a2f-02d6-4e89-841b-ca1b77a4d633\").text(\"BokehJS successfully loaded\");\n", + " },\n", + " function(Bokeh) {\n", + " console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-0.11.1.min.css\");\n", + " Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-0.11.1.min.css\");\n", + " console.log(\"Bokeh: injecting CSS: https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.11.1.min.css\");\n", + " Bokeh.embed.inject_css(\"https://cdn.pydata.org/bokeh/release/bokeh-widgets-0.11.1.min.css\");\n", + " }\n", + " ];\n", + "\n", + " function run_inline_js() {\n", + " for (var i = 0; i < inline_js.length; i++) {\n", + " inline_js[i](window.Bokeh);\n", + " }\n", + " }\n", + "\n", + " if (window._bokeh_is_loading === 0) {\n", + " console.log(\"Bokeh: BokehJS loaded, going straight to plotting\");\n", + " run_inline_js();\n", + " } else {\n", + " load_libs(js_urls, function() {\n", + " console.log(\"Bokeh: BokehJS plotting callback run at\", now());\n", + " run_inline_js();\n", + " });\n", + " }\n", + "}(this));" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from __future__ import print_function, division\n", + "import numpy as np\n", + "import pandas as pd\n", + "import warnings\n", + "warnings.filterwarnings('ignore')\n", + "import seaborn as sns\n", + "from bokeh.plotting import figure, show, output_notebook, output_file\n", + "from bokeh.models import ColumnDataSource, Range1d\n", + "\n", + "output_notebook()" + ] + }, + { + "cell_type": "code", + "execution_count": 319, + "metadata": { + "collapsed": false, + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
SentenceIdClause TextCodesExperimentValuesExperimentSpanParagraphHeadingsFloatingBox?Discourse TypefriesSentenceIdfriesEventsIdsfriesEventsDetailsfriesEventText
0s1Mechanisms through which[][][]-NaNFalsenone----
1s1Sos-1 coordinates the activation of Ras and Rac[][][]-NaNFalsenone----
2s2Signaling from receptor tyrosine kinases ( RTK...[][][]p4NaNFalsenone----
3s2requires the sequential activation of the smal...[][][]p4NaNFalsefact----
4s3Son of sevenless ( Sos-1 ) , a bifunctional gu...[][][]p4NaNFalsefact----
\n", + "
" + ], + "text/plain": [ + " SentenceId Clause Text Codes \\\n", + "0 s1 Mechanisms through which [] \n", + "1 s1 Sos-1 coordinates the activation of Ras and Rac [] \n", + "2 s2 Signaling from receptor tyrosine kinases ( RTK... [] \n", + "3 s2 requires the sequential activation of the smal... [] \n", + "4 s3 Son of sevenless ( Sos-1 ) , a bifunctional gu... [] \n", + "\n", + " ExperimentValues ExperimentSpan Paragraph Headings FloatingBox? \\\n", + "0 [] [] - NaN False \n", + "1 [] [] - NaN False \n", + "2 [] [] p4 NaN False \n", + "3 [] [] p4 NaN False \n", + "4 [] [] p4 NaN False \n", + "\n", + " Discourse Type friesSentenceId friesEventsIds friesEventsDetails \\\n", + "0 none - - - \n", + "1 none - - - \n", + "2 none - - - \n", + "3 fact - - - \n", + "4 fact - - - \n", + "\n", + " friesEventText \n", + "0 - \n", + "1 - \n", + "2 - \n", + "3 - \n", + "4 - " + ] + }, + "execution_count": 319, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "tsv = pd.read_csv('/Users/Gully/Documents/Projects/2_active/bigMech/work/2016-07-27-pathwayLogic/tsv1file/PMC2173577.scidp.discourse.tsv',\n", + " sep='\\t')\n", + "pmcId = \"PMC2173577\"\n", + "tsv.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": 323, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import re\n", + "from sets import Set\n", + "\n", + "def read_codes(es):\n", + "\n", + " if( es != es ):\n", + " es = \"[]\"\n", + " \n", + " removeCommaAndRegex = re.compile(r\", and \", re.IGNORECASE)\n", + " es = removeCommaAndRegex.sub(\",\", es)\n", + "\n", + " removeAndRegex = re.compile(r\" and \", re.IGNORECASE)\n", + " es = removeAndRegex.sub(\",\", es)\n", + " \n", + " codes = Set()\n", + "\n", + " for c in re.findall('[Ss]{0,1}\\d+[\\s,]{0,2}[A-Za-z,;\\-\\s]*', es):\n", + "\n", + " #print(c)\n", + " \n", + " simpleM = re.match('\\d+$', c)\n", + " simpleSubM = re.match('\\d+[\\s,]{0,2}([A-Za-z])', c)\n", + " intM = 
re.match('\\d+[\\s,]{0,2}([A-Za-z]+)\\-([A-Za-z]+)', c)\n", + " comma2M = re.match('\\d+[\\s,]{0,2}([A-Za-z]+)[;,]\\s{0,1}([A-Za-z]+)', c)\n", + " comma3M = re.match('\\d+[\\s,]{0,2}([A-Za-z]+)[;,]\\s{0,1}([A-Za-z]+)[;,]\\s{0,1}([A-Za-z]+)', c)\n", + " \n", + " suppM = re.match('([Ss]){1,1}\\d+', c)\n", + " \n", + " figM = re.match('(\\d+)', c)\n", + " fig = figM.group(1)\n", + " \n", + " if( intM is not None ):\n", + " start = ord(intM.group(1))\n", + " end = ord(intM.group(2))\n", + " for ascii_code in range(start, end+1): \n", + " codes.add(fig + chr(ascii_code))\n", + " #print(\" int:\" + fig + chr(ascii_code))\n", + " \n", + " elif( comma3M is not None ):\n", + " codes.add(fig + comma3M.group(1))\n", + " codes.add(fig + comma3M.group(2))\n", + " codes.add(fig + comma3M.group(3))\n", + " #print(\" comma3:\" + fig + comma3M.group(1))\n", + " #print(\" comma3:\" + fig + comma3M.group(2))\n", + " #print(\" comma3:\" + fig + comma3M.group(3))\n", + " \n", + " elif( comma2M is not None ):\n", + " codes.add(fig + comma2M.group(1))\n", + " codes.add(fig + comma2M.group(2))\n", + " #print(\" comma2:\" + fig + comma2M.group(1))\n", + " #print(\" comma2:\" + fig + comma2M.group(2))\n", + " \n", + " elif( simpleM is not None ):\n", + " codes.add(fig)\n", + " #print(\" simple:\" + fig)\n", + " \n", + " elif( simpleSubM is not None ):\n", + " codes.add(fig + simpleSubM.group(1))\n", + " #print(\" simpleSub:\" + fig + simpleSubM.group(1))\n", + " \n", + " return codes" + ] + }, + { + "cell_type": "code", + "execution_count": 358, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + " expt clause_id discourse_type heading expt_id color\n", + "0 1A 58 method Results 0 LightGray\n", + "1 1A 59 method Results 0 LightGray\n", + "2 1A 60 method Results 0 LightGray\n", + "3 1A 61 method Results 0 LightGray\n", + "4 1B 62 result Results 1 Thistle\n", + "5 1C 63 goal Results 2 LightGray\n", + "6 1C 64 method Results 
2 LightGray\n", + "7 1C 65 method Results 2 LightGray\n", + "8 1C 66 fact Results 2 Snow\n", + "9 1C 67 fact Results 2 Snow\n", + "10 1C 68 result Results 2 Thistle\n", + "11 1C 69 method Results 2 LightGray\n", + "12 1C 70 result Results 2 Thistle\n", + "13 1C 71 result Results 2 Thistle\n", + "14 1C 72 result Results 2 Thistle\n", + "15 1C 73 result Results 2 Thistle\n", + "16 2A 91 goal Results 3 LightGray\n", + "17 2A 92 method Results 3 LightGray\n", + "18 2A 93 method Results 3 LightGray\n", + "19 2A 94 result Results 3 Thistle\n", + "20 2A 95 method Results 3 LightGray\n", + "21 2A 96 implication Results 3 Plum\n", + "22 2D 99 hypothesis Results 6 Snow\n", + "23 3B 99 hypothesis Results 8 Snow\n", + "24 3A 99 hypothesis Results 7 Snow\n", + "25 2B 99 hypothesis Results 4 Snow\n", + "26 2C 99 hypothesis Results 5 Snow\n", + "30 2B 100 method Results 4 LightGray\n", + "29 3A 100 method Results 7 LightGray\n", + "31 2C 100 method Results 5 LightGray\n", + ".. ... ... ... ... ... ...\n", + "100 7C 184 hypothesis Results 15 Snow\n", + "101 7C 185 hypothesis Results 15 Snow\n", + "102 7C 186 fact Results 15 Snow\n", + "103 7C 187 fact Results 15 Snow\n", + "104 7C 188 hypothesis Results 15 Snow\n", + "105 7C 189 fact Results 15 Snow\n", + "106 7C 190 method Results 15 LightGray\n", + "107 7C 191 method Results 15 LightGray\n", + "108 7C 192 method Results 15 LightGray\n", + "109 7C 193 result Results 15 Thistle\n", + "110 7C 194 implication Results 15 Plum\n", + "111 7C 195 result Results 15 Thistle\n", + "112 7C 196 result Results 15 Thistle\n", + "113 7C 197 implication Results 15 Plum\n", + "114 7C 198 implication Results 15 Plum\n", + "115 8B 202 hypothesis Results 17 Snow\n", + "116 8A 202 hypothesis Results 16 Snow\n", + "117 8B 203 hypothesis Results 17 Snow\n", + "118 8A 203 hypothesis Results 16 Snow\n", + "119 8B 204 hypothesis Results 17 Snow\n", + "120 8A 204 hypothesis Results 16 Snow\n", + "121 8B 205 result Results 17 Thistle\n", + "122 8A 205 
result Results 16 Thistle\n", + "123 8B 206 result Results 17 Thistle\n", + "124 8A 206 result Results 16 Thistle\n", + "125 8A 209 result Results 16 Thistle\n", + "126 8C 210 result Results 18 Thistle\n", + "127 8D 214 method Results 19 LightGray\n", + "128 8D 215 result Results 19 Thistle\n", + "129 8D 216 result Results 19 Thistle\n", + "\n", + "[130 rows x 6 columns]\n" + ] + } + ], + "source": [ + "gantt_rows = []\n", + "gantt2_rows = []\n", + "\n", + "dtypes = [\"fact\",\"hypothesis\",\"problem\",\"goal\" ,\"method\",\"result\",\"implication\"]\n", + "colors = [\"Snow\" ,\"Snow\" ,\"Snow\" ,\"LightGray\",\"LightGray\" ,\"Thistle\" ,\"Plum\"] \n", + "colors_s = pd.Series(colors, index=dtypes)\n", + "\n", + "all_codes = Set() \n", + "\n", + "clause_max = -1\n", + "clause_min = 1000\n", + "\n", + "for i,row in tsv.iterrows():\n", + " es = row['ExperimentValues']\n", + " exptSpan = row['ExperimentSpan']\n", + " dt = row['Discourse Type']\n", + " sid = row['SentenceId']\n", + " paragraph = row['Paragraph']\n", + " heading = row['Headings']\n", + " floatingBox = row['FloatingBox?']\n", + "\n", + " if( heading != heading ):\n", + " heading = \"\"\n", + "\n", + " #if(not floatingBox):\n", + " # clause_max = i\n", + " \n", + " if( re.match('^Result', heading) is None or floatingBox):\n", + " continue\n", + " \n", + " if( i > clause_max):\n", + " clause_max = i\n", + " if( i < clause_min):\n", + " clause_min = i\n", + " \n", + " codes = read_codes(es) \n", + " for c in codes:\n", + " gantt_rows.append([c, i, dt, heading])\n", + " all_codes.add(c)\n", + " \n", + " spanCodes = read_codes(exptSpan)\n", + " for c in spanCodes:\n", + " gantt2_rows.append([c, i, dt, heading])\n", + " \n", + "codes_s = pd.Series(range(len(all_codes)), index=sorted(list(all_codes)))\n", + "\n", + "gantt_df = pd.DataFrame.from_records(gantt_rows, columns=['expt','clause_id','discourse_type', 'heading']) \n", + "gantt_df = gantt_df.sort(columns=['clause_id'], ascending=True)\n", + "\n", + 
"gantt2_df = pd.DataFrame.from_records(gantt2_rows, columns=['expt','clause_id','discourse_type', 'heading']) \n", + "gantt2_df = gantt2_df.sort(columns=['clause_id'], ascending=True)\n", + "\n", + "#print(codes_s.loc[gantt_df['expt'].tolist()].tolist())\n", + "\n", + "gantt_df['expt_id'] = codes_s.loc[gantt_df['expt'].tolist()].tolist()\n", + "gantt2_df['expt_id'] = codes_s.loc[gantt2_df['expt'].tolist()].tolist()\n", + "\n", + "gantt2_df['color'] = colors_s.loc[gantt2_df['discourse_type'].tolist()].tolist()\n", + "\n", + "print(gantt2_df)" + ] + }, + { + "cell_type": "code", + "execution_count": 359, + "metadata": { + "collapsed": false, + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "\n", + "\n", + "
\n", + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "

<Bokeh Notebook handle for In[359]>

" + ], + "text/plain": [ + "" + ] + }, + "execution_count": 359, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "G=figure(title=pmcId, width=800, height=600, \n", + " x_range=Range1d(clause_min, clause_max), y_range=list(codes_s.index.values))\n", + "G.xaxis.axis_label=\"Clause #\"\n", + "G.yaxis.axis_label=\"Figure Code\"\n", + "\n", + "gantt2_df['top']=gantt2_df['expt_id']+0.8\n", + "gantt2_df['bottom']=gantt2_df['expt_id']+1.2\n", + "gantt2_df['left']=gantt2_df['clause_id']-0.5\n", + "gantt2_df['right']=gantt2_df['clause_id']+0.5\n", + "\n", + "cds2 = ColumnDataSource(gantt2_df)\n", + "G.quad(left='left', right='right', bottom='bottom', top='top',source=cds2, line_color=\"gray\", color='color')\n", + "\n", + "cds = ColumnDataSource(gantt_df)\n", + "G.scatter('clause_id', 'expt', source=cds, marker='x', size=15,\n", + " line_color=\"red\", fill_color=\"red4\")\n", + "\n", + "#G.rect(,\"Item\",source=CDS)\n", + "show(G)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.13" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/DySE Model Elements.ipynb b/notebooks/DySE Model Elements.ipynb new file mode 100644 index 0000000..f3a2e3c --- /dev/null +++ b/notebooks/DySE Model Elements.ipynb @@ -0,0 +1,39 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + 
"source": [ + "import sys\n", + "sys.path.append(\"..\")\n", + "from robot_biocurator.reactome import *\n", + "import pandas as pd" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python [Root]", + "language": "python", + "name": "Python [Root]" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/INTACT Processing Scripts.ipynb b/notebooks/INTACT Processing Scripts.ipynb new file mode 100644 index 0000000..6935791 --- /dev/null +++ b/notebooks/INTACT Processing Scripts.ipynb @@ -0,0 +1,3021 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Load a list of open access PMID files." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import os\n", + "from subprocess import call\n", + "\n", + "pmids = []\n", + "pmid_file = \"/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/pmids.txt\"\n", + "\n", + "with open(pmid_file) as f:\n", + " for line in f.readlines():\n", + " pmids.append(line.strip())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "Shell commands to build the zipped bundle. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# echo -n application/vnd.wf4ever.robundle+zip > mimetype\n", + "# zip -0 -X ../reach mimetype\n", + "# zip -X -r ../reach . -x mimetype\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Convert the sentences from each paper processed by SciDT into simple sentences for each Figure assignment." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import pandas as pd\n", + "from nltk.corpus import stopwords\n", + "import re\n", + "\n", + "def retrieve_sentences_for_modeling(inFile, fid):\n", + " \n", + " tsv = pd.read_csv(inFile, sep='\\t')\n", + " fig_tagged_sentences = {}\n", + "\n", + " for i, row in tsv.iterrows():\n", + " sid = row['SentenceId']\n", + " codeStr = row['Codes']\n", + " paragraph = row['Paragraph']\n", + " text = row['Sentence Text']\n", + " heading = row['Headings']\n", + " floatingBox = row['FloatingBox?']\n", + " discourse = row['Discourse Type']\n", + " reachData = row['friesEventsTypes']\n", + " fig = row['Figure Assignment']\n", + " offset_start = row['Offset_Begin']\n", + " offset_end = row['Offset_End']\n", + " \n", + " if fig == fig:\n", + " for f in fig.split('|'):\n", + " if( fig_tagged_sentences.get(f, None) is None ):\n", + " sent_list = []\n", + " fig_tagged_sentences[f] = sent_list\n", + " sent_list.append({'sid': sid, 'pid':paragraph, \n", + " 'start': offset_start, 'end': offset_end, 'text': text,\n", + " 'discourse_types': discourse})\n", + " else:\n", + " sent_list = fig_tagged_sentences[f]\n", + " sent_list.append({'sid': sid, 'pid':paragraph, \n", + " 'start': offset_start, 'end': offset_end, 'text': text,\n", + " 'discourse_types': discourse})\n", + " \n", + " \n", + " return fig_tagged_sentences" + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "10087260\n", + "10087263\n", + "10087265\n", + "10209036\n", + "10225955\n", + "10366597\n", + "10366599\n", + "10385523\n", + "10385526\n", + "10402465\n", + "10429675\n", + "10545507\n", + "10562275\n", + "10562277\n", + "10562279\n", + "10562288\n", + "10601328\n", + "10601346\n", + "10613896\n", + "10620603\n", + "10629222\n", + "10648568\n", + 
"10662770\n", + "10684247\n", + "10704439\n", + "10704444\n", + "10704446\n", + "10725331\n", + "10725334\n", + "10747088\n", + "10747089\n", + "10790433\n", + "10811823\n", + "10831611\n", + "10859335\n", + "10864201\n", + "10871282\n", + "10900456\n", + "10931856\n", + "10931877\n", + "10953014\n", + "10974003\n", + "10995436\n", + "11018051\n", + "11018064\n", + "11034606\n", + "11038172\n", + "11038182\n", + "11076969\n", + "11086001\n", + "11134073\n", + "11149930\n", + "11157975\n", + "11157979\n", + "11157984\n", + "11181702\n", + "11257119\n", + "11266443\n", + "11266449\n", + "11266451\n", + "11309418\n", + "11401320\n", + "11402059\n", + "11448995\n", + "11502761\n", + "11514608\n", + "11564755\n", + "11570975\n", + "11571312\n", + "11591728\n", + "11591731\n", + "11684708\n", + "11724822\n", + "11739401\n", + "11739402\n", + "11739404\n", + "11747467\n", + "11756480\n", + "11777938\n", + "11846885\n", + "11877480\n", + "11914126\n", + "11916981\n", + "11927603\n", + "11927608\n", + "12011112\n", + "12147674\n", + "12167173\n", + "12199906\n", + "12421467\n", + "12446742\n", + "12473693\n", + "12486103\n", + "12486115\n", + "12507995\n", + "12527750\n", + "12566426\n", + "12642614\n", + "12682088\n", + "12689351\n", + "12719471\n", + "12771128\n", + "12782684\n", + "12847081\n", + "12900395\n", + "12939254\n", + "14517205\n", + "14568990\n", + "14612908\n", + "14638857\n", + "14707117\n", + "14709540\n", + "14734533\n", + "14737190\n", + "14970179\n", + "15037601\n", + "15045029\n", + "15051809\n", + "15070402\n", + "15078903\n", + "15096524\n", + "15148308\n", + "15159416\n", + "15302858\n", + "15314064\n", + "15326198\n", + "1541635\n", + "15477347\n", + "15504911\n", + "15575970\n", + "15583694\n", + "15642746\n", + "15653635\n", + "15720729\n", + "15767459\n", + "15796781\n", + "15828860\n", + "15883195\n", + "15928207\n", + "15967037\n", + "16000169\n", + "16027220\n", + "16043514\n", + "16043515\n", + "16061695\n", + "16087707\n", + "16098226\n", + 
"16115959\n", + "16166655\n", + "16179646\n", + "16179649\n", + "16203867\n", + "16254079\n", + "16301747\n", + "16356270\n", + "16396833\n", + "16396834\n", + "16403219\n", + "16415179\n", + "16449187\n", + "16492808\n", + "16513846\n", + "16520382\n", + "16545136\n", + "16603075\n", + "16606443\n", + "16618814\n", + "16636147\n", + "16638120\n", + "16672054\n", + "16717130\n", + "16729043\n", + "16754960\n", + "16839418\n", + "16847100\n", + "16872538\n", + "16880273\n", + "16893970\n", + "16923827\n", + "16945160\n", + "16982639\n", + "16990252\n", + "17000877\n", + "17030985\n", + "17085477\n", + "17112379\n", + "17151076\n", + "17183697\n", + "17224084\n", + "17280616\n", + "17284314\n", + "17341466\n", + "17353368\n", + "17353931\n", + "17407569\n", + "17412707\n", + "17470632\n", + "17485491\n", + "17485524\n", + "17500595\n", + "17511879\n", + "17543119\n", + "17557078\n", + "17581628\n", + "17591856\n", + "17605817\n", + "17608567\n", + "17612402\n", + "17620405\n", + "17620407\n", + "17623094\n", + "17627824\n", + "17650322\n", + "17660750\n", + "17660751\n", + "17667950\n", + "17690686\n", + "17721441\n", + "17724128\n", + "17762866\n", + "17889823\n", + "17936057\n", + "17937504\n", + "17948060\n", + "17986458\n", + "18000013\n", + "18034155\n", + "18086859\n", + "18154663\n", + "18157088\n", + "18171471\n", + "18188153\n", + "18188154\n", + "18208323\n", + "18226242\n", + "18239682\n", + "18256700\n", + "18266467\n", + "18268103\n", + "18286207\n", + "18289379\n", + "18292755\n", + "18301737\n", + "18309292\n", + "18309293\n", + "18309296\n", + "18320063\n", + "18354501\n", + "18377662\n", + "18388858\n", + "18394558\n", + "18412956\n", + "18421166\n", + "18430226\n", + "18433452\n", + "18435708\n", + "18447585\n", + "18452624\n", + "18458160\n", + "18466225\n", + "18479511\n", + "18497748\n", + "18498651\n", + "18498752\n", + "18509523\n", + "18511940\n", + "18518979\n", + "18551167\n", + "18560762\n", + "18573912\n", + "18583960\n", + "18583988\n", + 
"18586827\n", + "18604270\n", + "18612383\n", + "18617507\n", + "18628297\n", + "18628823\n", + "18629017\n", + "18631241\n", + "18647389\n", + "18662404\n", + "18663010\n", + "18665261\n", + "18671868\n", + "18682833\n", + "18703495\n", + "18724936\n", + "18758438\n", + "18761697\n", + "18762578\n", + "18762581\n", + "18775314\n", + "18775702\n", + "18779372\n", + "18781224\n", + "18794331\n", + "18800055\n", + "18802460\n", + "18808384\n", + "18812399\n", + "18818696\n", + "18833289\n", + "18836139\n", + "18923419\n", + "18927618\n", + "18946488\n", + "18953286\n", + "18955484\n", + "18985028\n", + "19008859\n", + "19019158\n", + "19037259\n", + "19055777\n", + "19060904\n", + "19063885\n", + "19079254\n", + "19088080\n", + "19088272\n", + "19107194\n", + "19107203\n", + "19114595\n", + "19118384\n", + "19131970\n", + "19135240\n", + "19150989\n", + "19153600\n", + "19155274\n", + "19156129\n", + "19158676\n", + "19165350\n", + "19167335\n", + "19171758\n", + "19214185\n", + "19229298\n", + "19277118\n", + "19290556\n", + "19322195\n", + "19322197\n", + "19329994\n", + "19360002\n", + "19369943\n", + "19407811\n", + "19419567\n", + "19432797\n", + "19440292\n", + "19440376\n", + "19455133\n", + "19486527\n", + "19494831\n", + "19498462\n", + "19498465\n", + "19506933\n", + "19520861\n", + "19521502\n", + "19523115\n", + "19536134\n", + "19536198\n", + "19543227\n", + "19567478\n", + "19570034\n", + "19570982\n", + "19575010\n", + "19590579\n", + "19609305\n", + "19616007\n", + "19619546\n", + "19625296\n", + "19629177\n", + "19635168\n", + "19636380\n", + "19648646\n", + "19680228\n", + "19680552\n", + "19682256\n", + "19690564\n", + "19696784\n", + "19701182\n", + "19701191\n", + "19704022\n", + "19730435\n", + "19730696\n", + "19736317\n", + "19746159\n", + "19763081\n", + "19765300\n", + "19767740\n", + "19781631\n", + "19797078\n", + "19798056\n", + "19798101\n", + "19807924\n", + "19841731\n", + "19874541\n", + "19887001\n", + "19888460\n", + "19888464\n", + 
"19893485\n", + "19893486\n", + "19893489\n", + "19893491\n", + "19903340\n", + "19927124\n", + "19933256\n", + "19933576\n", + "19934257\n", + "19934264\n", + "19940019\n", + "19941819\n", + "19941825\n", + "19942852\n", + "19959993\n", + "19959995\n", + "19996314\n", + "20007317\n", + "20037628\n", + "20043912\n", + "20071408\n", + "20075079\n", + "20075868\n", + "20094031\n", + "20098747\n", + "20110348\n", + "20111005\n", + "20123736\n", + "20129940\n", + "20140193\n", + "20141835\n", + "20169075\n", + "20169078\n", + "20169165\n", + "20174651\n", + "20178605\n", + "20186120\n", + "20205919\n", + "20211136\n", + "20214800\n", + "20224550\n", + "20231380\n", + "20300060\n", + "20305656\n", + "20308429\n", + "20338032\n", + "20353594\n", + "20360680\n", + "20362541\n", + "20362542\n", + "20368803\n", + "20371544\n", + "20375098\n", + "20388642\n", + "20399778\n", + "20400938\n", + "20410134\n", + "20418871\n", + "20418951\n", + "20434988\n", + "20436455\n", + "20439537\n", + "20453830\n", + "20456499\n", + "20467438\n", + "20471980\n", + "20512112\n", + "20529865\n", + "20540776\n", + "20543819\n", + "20559324\n", + "20561531\n", + "20562859\n", + "20574810\n", + "20579338\n", + "20581830\n", + "20584916\n", + "20594350\n", + "20601937\n", + "20603002\n", + "20603614\n", + "20624308\n", + "20628654\n", + "20639901\n", + "20639902\n", + "20642453\n", + "20657822\n", + "20659021\n", + "20664520\n", + "20676093\n", + "20676095\n", + "20676135\n", + "20686606\n", + "20693977\n", + "20697347\n", + "20697357\n", + "20700126\n", + "20706207\n", + "20711500\n", + "20729920\n", + "20738866\n", + "20802085\n", + "20802534\n", + "20802536\n", + "20817927\n", + "20818336\n", + "20818435\n", + "20819940\n", + "20840750\n", + "20843328\n", + "20856196\n", + "20856200\n", + "20856870\n", + "20862261\n", + "20865124\n", + "20871633\n", + "20881089\n", + "20890303\n", + "20890305\n", + "20920251\n", + "20924358\n", + "20929568\n", + "20929579\n", + "20932347\n", + "20935634\n", + 
"20935647\n", + "20935677\n", + "20936779\n", + "20953186\n", + "20969766\n", + "20972225\n", + "20972459\n", + "20976523\n", + "21034468\n", + "21037577\n", + "21047798\n", + "21048921\n", + "21048939\n", + "21057456\n", + "21057510\n", + "21063388\n", + "21078624\n", + "21092281\n", + "21092292\n", + "21110861\n", + "21112398\n", + "21113127\n", + "21114864\n", + "21118991\n", + "21119599\n", + "21119626\n", + "21124868\n", + "21124943\n", + "21131964\n", + "21131965\n", + "21131967\n", + "21132010\n", + "21139566\n", + "21147767\n", + "21148288\n", + "21149568\n", + "21151104\n", + "21157431\n", + "2116421\n", + "21170087\n", + "21172016\n", + "21179004\n", + "21179020\n", + "21179510\n", + "2118142\n", + "21186367\n", + "21187329\n", + "21203429\n", + "21203436\n", + "21209940\n", + "21212461\n", + "2121740\n", + "21217644\n", + "21217774\n", + "21219645\n", + "21220045\n", + "21224849\n", + "21224850\n", + "21242965\n", + "21242966\n", + "21242980\n", + "21245844\n", + "21247419\n", + "21251231\n", + "21252856\n", + "21274006\n", + "21277013\n", + "21278383\n", + "21278420\n", + "21278786\n", + "21288885\n", + "21297662\n", + "21306563\n", + "21311558\n", + "21314951\n", + "21317875\n", + "21328542\n", + "21335238\n", + "21336258\n", + "21338522\n", + "21347350\n", + "21364888\n", + "21386817\n", + "21386897\n", + "21390248\n", + "21399620\n", + "21399639\n", + "21399666\n", + "21407176\n", + "21407215\n", + "21408167\n", + "21415856\n", + "21423209\n", + "21423216\n", + "21427704\n", + "21439629\n", + "21445305\n", + "21447707\n", + "21454693\n", + "21498514\n", + "21505799\n", + "21507240\n", + "21516116\n", + "21525870\n", + "21525958\n", + "21526181\n", + "21533037\n", + "21541365\n", + "21556049\n", + "21559518\n", + "21569246\n", + "21575178\n", + "21575199\n", + "21577200\n", + "21586138\n", + "21602887\n", + "21613545\n", + "21625644\n", + "21637789\n", + "21642953\n", + "21643011\n", + "21668996\n", + "21669201\n", + "21679440\n", + "21685908\n", + 
"21685939\n", + "21685944\n", + "21689417\n", + "21698133\n", + "21701560\n", + "21705390\n", + "21706061\n", + "21718540\n", + "21725360\n", + "21725367\n", + "21734647\n", + "21743437\n", + "21743479\n", + "21743491\n", + "21747946\n", + "21781306\n", + "21798038\n", + "21804533\n", + "21811563\n", + "21822214\n", + "21847096\n", + "21847100\n", + "21857973\n", + "21871133\n", + "21874024\n", + "21875956\n", + "21880142\n", + "21884581\n", + "21890893\n", + "21893585\n", + "21903581\n", + "21908610\n", + "21909133\n", + "21909281\n", + "21931555\n", + "21931591\n", + "21943085\n", + "21946559\n", + "21946560\n", + "21952049\n", + "21964608\n", + "21988832\n", + "21998301\n", + "22010978\n", + "22014111\n", + "22016384\n", + "22022230\n", + "22022540\n", + "22027862\n", + "22028648\n", + "22034500\n", + "22046270\n", + "22048310\n", + "22056778\n", + "22056872\n", + "22057290\n", + "22059385\n", + "22068330\n", + "22072986\n", + "22087277\n", + "22094269\n", + "22096563\n", + "22102817\n", + "22116401\n", + "22118466\n", + "22118625\n", + "22135285\n", + "22157895\n", + "22162999\n", + "22163275\n", + "22174692\n", + "22174833\n", + "22207579\n", + "22238662\n", + "22242148\n", + "22269274\n", + "22270917\n", + "22279592\n", + "22280843\n", + "22291595\n", + "22303461\n", + "22323290\n", + "22323517\n", + "22325148\n", + "22334672\n", + "22355679\n", + "22363216\n", + "22382979\n", + "22387996\n", + "22401567\n", + "22402981\n", + "22404908\n", + "22406378\n", + "22406686\n", + "22413019\n", + "22442151\n", + "22446626\n", + "22447027\n", + "22458338\n", + "22470507\n", + "22471946\n", + "22491013\n", + "22493164\n", + "22493500\n", + "22500027\n", + "22510880\n", + "22510882\n", + "22518138\n", + "22540012\n", + "22555292\n", + "22575651\n", + "22581261\n", + "22609302\n", + "22613832\n", + "22623428\n", + "22634751\n", + "22648170\n", + "22651821\n", + "22662192\n", + "22674187\n", + "22685417\n", + "22731636\n", + "22791023\n", + "22808155\n", + "22829933\n", + 
"22842785\n", + "22850675\n", + "22863774\n", + "22892566\n", + "22899650\n", + "22904065\n", + "22905162\n", + "22908322\n", + "22916011\n", + "22939623\n", + "22940692\n", + "22952686\n", + "22952718\n", + "22962574\n", + "22962849\n", + "22966907\n", + "22977175\n", + "23022564\n", + "23023393\n", + "23042150\n", + "23056421\n", + "23065768\n", + "23075850\n", + "23082202\n", + "23082758\n", + "23085988\n", + "23086447\n", + "23086448\n", + "23088713\n", + "23104095\n", + "23104097\n", + "23142775\n", + "23143267\n", + "23161686\n", + "23170778\n", + "23183827\n", + "23209657\n", + "23216645\n", + "23217712\n", + "23236467\n", + "23253866\n", + "23263555\n", + "23275563\n", + "23284848\n", + "23316280\n", + "23332754\n", + "23353684\n", + "23353889\n", + "23369005\n", + "23369981\n", + "23395900\n", + "23395907\n", + "23399914\n", + "23403925\n", + "23405092\n", + "23414517\n", + "23431397\n", + "23449449\n", + "23455607\n", + "23463101\n", + "23467085\n", + "23505436\n", + "23511972\n", + "23514585\n", + "23520446\n", + "23533724\n", + "23549287\n", + "23549480\n", + "23555304\n", + "23565095\n", + "23582324\n", + "23582331\n", + "23585889\n", + "23593007\n", + "23621612\n", + "23622247\n", + "23634843\n", + "23650535\n", + "23658700\n", + "23667408\n", + "23675303\n", + "23680104\n", + "23685356\n", + "23693014\n", + "23706742\n", + "23708798\n", + "23725059\n", + "23734815\n", + "23737971\n", + "23741051\n", + "23750211\n", + "23752268\n", + "23758976\n", + "23772379\n", + "23773523\n", + "23782464\n", + "23788678\n", + "23799140\n", + "23799367\n", + "23823123\n", + "23829672\n", + "23840630\n", + "23840749\n", + "23840900\n", + "23855374\n", + "23857585\n", + "23861867\n", + "23866081\n", + "23890821\n", + "23902751\n", + "23907583\n", + "23909438\n", + "23910724\n", + "2391361\n", + "23918937\n", + "23933751\n", + "23935490\n", + "23935497\n", + "23940795\n", + "23948297\n", + "23949442\n", + "23967200\n", + "23979715\n", + "24001151\n", + "24006493\n", + 
"24009510\n", + "24009866\n", + "24034246\n", + "24035192\n", + "24056303\n", + "24063750\n", + "24065129\n", + "24069158\n", + "24069330\n", + "24069433\n", + "24075010\n", + "24076655\n", + "24076656\n", + "24083380\n", + "24086303\n", + "24090070\n", + "24094005\n", + "24098548\n", + "24113872\n", + "24117850\n", + "24125847\n", + "24145797\n", + "24161670\n", + "24167781\n", + "24176932\n", + "24189400\n", + "24191246\n", + "24223725\n", + "24240174\n", + "24243021\n", + "24244371\n", + "24263861\n", + "24269683\n", + "24274578\n", + "24275654\n", + "24282027\n", + "24286120\n", + "24311597\n", + "24314029\n", + "24330623\n", + "24344185\n", + "24349196\n", + "24349490\n", + "24365180\n", + "24366813\n", + "24374083\n", + "24397932\n", + "24416391\n", + "24434184\n", + "24473148\n", + "24502362\n", + "24515439\n", + "24527098\n", + "24555568\n", + "24561554\n", + "24563863\n", + "24566989\n", + "24568222\n", + "24582333\n", + "24587342\n", + "24610369\n", + "24618038\n", + "24618592\n", + "24626987\n", + "24643253\n", + "24651726\n", + "24656813\n", + "24722491\n", + "24754922\n", + "24798445\n", + "24823443\n", + "24835508\n", + "24843023\n", + "24847877\n", + "24855951\n", + "24872509\n", + "24879895\n", + "24904275\n", + "24914955\n", + "24937146\n", + "24960027\n", + "24960071\n", + "24963139\n", + "24964212\n", + "24983867\n", + "25009464\n", + "25147953\n", + "25159688\n", + "25170085\n", + "25171412\n", + "25225338\n", + "25260594\n", + "25260751\n", + "25277244\n", + "25278935\n", + "25294836\n", + "25294943\n", + "25314077\n", + "25321483\n", + "25360523\n", + "25374563\n", + "25425574\n", + "25445562\n", + "25473596\n", + "25519916\n", + "25533335\n", + "25609649\n", + "25653167\n", + "25697406\n", + "25767811\n", + "7528772\n", + "7561682\n", + "7593161\n", + "7807015\n", + "7844150\n", + "8253836\n", + "8551220\n", + "8609167\n", + "8627166\n", + "8627180\n", + "8666671\n", + "8666672\n", + "8691146\n", + "8691154\n", + "8707857\n", + "8858162\n", + 
"8922390\n", + "9060478\n", + "9128257\n", + "9151673\n", + "9214383\n", + "9214386\n", + "9334338\n", + "9412461\n", + "9425168\n", + "9472029\n", + "9531549\n", + "9531566\n", + "9628892\n", + "9660868\n", + "9700171\n", + "9763420\n", + "9786960\n", + "9813092\n", + "9817749\n", + "9864353\n", + "9864360\n", + "9922454\n", + "9971739\n" + ] + } + ], + "source": [ + "tsv_dir = \"/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/scidt_fries_bioc_tsv4\"\n", + "sentence_dir = \"/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/fig_sentences\"\n", + "\n", + "for root, dirs, files in os.walk(tsv_dir):\n", + " for file in files: \n", + " if os.path.isfile(root+'/'+file) and file[-4:]=='.tsv' :\n", + " pmid = file[:-4]\n", + " if( pmid in pmids ):\n", + " print( pmid )\n", + " fig_tagged_sentences = retrieve_sentences_for_modeling(root+'/'+file, pmid)\n", + " for fig in fig_tagged_sentences.keys():\n", + " out = open(sentence_dir+'/'+pmid+'_'+fig+'.txt', 'w')\n", + " for sent_hash in fig_tagged_sentences[fig]:\n", + " out.write(sent_hash['text'] + '\\n')\n", + " out.close()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Functions to simplify INTACT records from their standard XML into TSV format. " + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from bs4 import BeautifulSoup\n", + "import re\n", + "\n", + "def build_figure_extraction_patterns():\n", + " bf = \"\\s*f(igs|igs\\.|ig|ig\\.|igure|\\.|ig\\:){0,1}\"\n", + " d = \"\\s*(\\d+\\s*[\\.\\;\\,]{0,1}\\s*[a-z]*)\\s*\\.{0,1}\\s*\"\n", + " d_split = \"\\s*(\\d*)\\s*[\\.\\;\\,]{0,1}\\s*([a-z]*)\"\n", + " interval = \"\\s*(\\d+)([a-z]+)\\\\-([a-z]+)\"\n", + " pattHash = {} \n", + " \n", + " figPatt = []\n", + " pattHash['figPatt'] = figPatt\n", + " \n", + " # 0. No alphanumeric codes at all: 'Figure. 
1; more text'\n", + " figPatt.append(re.compile(\"^\" + bf + d + \"$\")) \n", + " figPatt.append(re.compile(\"^\" + bf + \"\\s*(\\d+\\s*[\\.\\;\\,]{0,1}\\s*[a-z]*)[\\,\\;\\.]{0,1}\\s*t\"))\n", + " figPatt.append(re.compile(\"^\" + bf + \"\\s*(\\d+\\s*[\\.\\;\\,]{0,1}\\s*[a-z]*)[\\,\\;\\.]{0,1}\\s*s\"))\n", + " figPatt.append(re.compile(\"^\" + bf + \"\\s*(\\d+\\s*[\\.\\;\\,]{0,1}\\s*[a-z]*)[\\,\\;\\.]{0,1}\\s+and\\s+s\"))\n", + " \n", + " # [1]\n", + " simplePatt = re.compile(\"^\" + d + \"$\");\n", + " pattHash['simplePatt'] = simplePatt\n", + " \n", + " # [2,4] \n", + " space2Patt = re.compile(\"^\" + bf + d + \"\\s+\" + bf + d + \"$\");\n", + " pattHash['space2Patt'] = space2Patt\n", + "\n", + " # [2,4,6] \n", + " space3Patt = re.compile(\"^\"+bf+d+\"\\s+\"+bf+d+\"\\s+\"+bf+d+\"$\");\n", + " pattHash['space3Patt'] = space3Patt\n", + "\n", + " # [2,4]\n", + " fullComma2Patt = re.compile(\"^\" + bf + d + \"[\\;\\,]\" + bf + d + \"$\")\n", + " pattHash['fullComma2Patt'] = fullComma2Patt\n", + " \n", + " # [2,3]\n", + " comma2Patt = re.compile(\"^\" + bf + d + \"[\\;\\,]\" + d + \"$\")\n", + " pattHash['comma2Patt'] = comma2Patt\n", + "\n", + " # [1,2]\n", + " simpleComma2Patt = re.compile(\"^\" + d + \"[\\;\\,]\" + d + \"$\")\n", + " pattHash['simpleComma2Patt'] = simpleComma2Patt\n", + "\n", + " # [2,3,4]\n", + " comma3Patt = re.compile(\"^\" + bf + d + \"[\\;\\,]\" + d + \"[\\;\\,]\" + d + \"$\");\n", + " pattHash['comma3Patt'] = comma3Patt\n", + " \n", + " # [1,2,3]\n", + " simpleComma3Patt = re.compile(\"^\" + d + \"[\\;\\,]\" + d + \"[\\;\\,]\" + d + \"$\");\n", + " pattHash['simpleComma3Patt'] = simpleComma3Patt\n", + "\n", + " # [2,3,4,5]\n", + " comma4Patt = re.compile(\"^\"+bf+d+\"[\\;\\,]\"+d+\"[\\;\\,]\"+d+\"[\\;\\,]\"+d+\"$\");\n", + " pattHash['comma4Patt'] = comma4Patt\n", + "\n", + " # [2,3,4,5,6]\n", + " comma5Patt = re.compile(\"^\"+bf+d+\"[\\;\\,]\"+d+\"[\\;\\,]\"+d+\"[\\;\\,]\"+d+\"[\\;\\,]\"+d+\"$\");\n", + " pattHash['comma5Patt'] = 
comma5Patt\n", + "\n", + " # [1,2,3,4]\n", + " simpleComma4Patt = re.compile(\"^\"+d+\"[\\;\\,]\"+d+\"[\\;\\,]\"+d+\"[\\;\\,]\"+d+\"$\");\n", + " pattHash['simpleComma4Patt'] = simpleComma4Patt\n", + "\n", + " # [2,3]\n", + " and2Patt = re.compile(\"^\" + bf + d + \"\\s+and\\s+\" + d + \"$\");\n", + " pattHash['and2Patt'] = and2Patt\n", + " \n", + " # [1,2]\n", + " simpleAnd2Patt = re.compile(\"^\" + d + \"\\s+and\\s+\" + d + \"$\");\n", + " pattHash['simpleAnd2Patt'] = simpleAnd2Patt\n", + "\n", + " # [1,2,3]\n", + " simple_a_and_b_patt = re.compile(\"^\" + d_split + \"\\s+and\\s+([a-z])$\");\n", + " pattHash['simple_a_and_b_patt'] = simple_a_and_b_patt\n", + "\n", + " # [2,3,4]\n", + " a_and_b_patt = re.compile(\"^\" + bf + d_split + \"\\s+and\\s+([a-z])$\");\n", + " pattHash['a_and_b_patt'] = a_and_b_patt\n", + "\n", + " # [1,2,3]\n", + " simple_a_comma_b_patt = re.compile(\"^\" + d_split + \"[\\;\\,]\\s*([a-z])$\");\n", + " pattHash['simple_a_comma_b_patt'] = simple_a_comma_b_patt\n", + "\n", + " # [2,3,4]\n", + " a_comma_b_patt = re.compile(\"^\"+bf+d_split+\"[\\;\\,]\\s*([a-z])$\");\n", + " pattHash['a_comma_b_patt'] = a_comma_b_patt\n", + "\n", + " # [1,2,3]\n", + " simple_a_comma_b_comma_c_patt = re.compile(\"^\" + d_split + \"[\\;\\,]\\s*([a-z])\\s*[\\;\\,]\\s*([a-z])$\");\n", + " pattHash['simple_a_comma_b_comma_c_patt'] = simple_a_comma_b_comma_c_patt\n", + "\n", + " # [2,3,4]\n", + " a_comma_b_comma_c_patt = re.compile(\"^\"+bf+d_split+\"[\\;\\,]\\s*([a-z])\\s*[\\;\\,]\\s*([a-z])$\");\n", + " pattHash['a_comma_b_comma_c_patt'] = a_comma_b_comma_c_patt\n", + "\n", + " # [2,3,4,5]\n", + " a_b_and_c_patt = re.compile(\"^\" + bf + d_split + \"[\\;\\,]\\s+([a-z])\\s+and\\s+([a-z])$\");\n", + " pattHash['a_b_and_c_patt'] = a_b_and_c_patt\n", + "\n", + " # [1,2,3,4]\n", + " simple_a_b_and_c_patt = re.compile(\"^\" + d_split + \"[\\;\\,]\\s+([a-z])\\s+and\\s+([a-z])$\");\n", + " pattHash['simple_a_b_and_c_patt'] = simple_a_b_and_c_patt\n", + "\n", + " 
tableFigPatt = re.compile(\"^t(ab\.|ab|able){0,1}.*\" + bf + d + \"$\");\n", + " pattHash['tableFigPatt'] = tableFigPatt\n", + "\n", + " intervalPatt = re.compile(\"^\" + bf + interval + \"$\");\n", + " pattHash['intervalPatt'] = intervalPatt\n", + "\n", + " # simple single table (table 1, t1, tab. 1a)\n", + " # returned value is second group\n", + " tablePatt = re.compile(\"^t(ab\.|ab|able){0,1}\s*([\di]+[a-z]{0,1})[\,\;\.]{0,1}$\");\n", + " pattHash['tablePatt'] = tablePatt\n", + "\n", + " # simple single supplementary table (supp table 1, st1, supp. tab. 1a)\n", + " # returned value is third group\n", + " suppTablePatt = re.compile(\"^s(upp|upp.|lementary){0,1}\s*t(ab\.|ab|able){0,1}\s*([i\d]+[a-z]{0,1})[\,\;\.]{0,1}$\");\n", + " pattHash['suppTablePatt'] = suppTablePatt\n", + " \n", + " return pattHash\n", + "\n", + "def run_simple_matcher(fig_text, patt_hash, patt_code, groups=[1]):\n", + " match = re.search(patt_hash.get(patt_code), fig_text)\n", + " results = []\n", + " if( match ) :\n", + " for g in groups:\n", + " results.append(match.group(g))\n", + " return results\n", + " else:\n", + " return None\n", + "\n", + "def build_matched_string(matched_list,code):\n", + " matched_str = \"\"\n", + " for mf in matched_list:\n", + " if len(matched_str) > 0 :\n", + " matched_str += '|'\n", + " matched_str += code + mf.replace(\" \", \"\").replace(\".\", \"\")\n", + " return matched_str\n", + "\n", + "def run_matcher(fig_text, patt_hash):\n", + " \n", + " if(fig_text == 'nfa' ):\n", + " return None\n", + " \n", + " # strip out all parentheses.\n", + " paren_patt = re.compile(\"(\\(.+?\\))\")\n", + " fig_text = re.sub(paren_patt, \"\", fig_text)\n", + "\n", + " # convert & to 'and'.\n", + " fig_text = fig_text.replace(\"&\", \"and\")\n", + " \n", + " fig_patt = patt_hash.get('figPatt')\n", + " for p in fig_patt:\n", + " match = re.search(p, fig_text)\n", + " if match:\n", + " return 'f' + match.group(2).replace(\" \",\"\").replace(\".\",\"\").replace(\",\",\"\")\n", + " \n", + " 
# [1] simplePatt\n", + " # [2,4] space2Patt\n", + " # [2,4,6] space3Patt\n", + " # [2,4] fullComma2Patt\n", + " # [2,3] comma2Patt\n", + " # [1,2] simpleComma2Patt\n", + " # [2,3,4] comma3Patt \n", + " # [1,2,3] simpleComma3Patt\n", + " # [2,3,4,5] comma4Patt\n", + " # [1,2,3,4] simpleComma4Patt\n", + " # [1,2] simpleAnd2Patt\n", + " # [1,2,3] simple_a_comma_b_patt \n", + " # [2,3,4] a_comma_b_patt \n", + " # [2,3,4,5] a_b_and_c_patt \n", + " # [1,2,3,4] simple_a_b_and_c_patt\n", + " \n", + " matched_figs = run_simple_matcher(fig_text, patt_hash, 'simplePatt', [1])\n", + " if( matched_figs is None ):\n", + " matched_figs = run_simple_matcher(fig_text, patt_hash, 'tableFigPatt', [3])\n", + " if( matched_figs is None ):\n", + " matched_figs = run_simple_matcher(fig_text, patt_hash, 'comma2Patt', [2,3])\n", + " if( matched_figs is None ):\n", + " matched_figs = run_simple_matcher(fig_text, patt_hash, 'fullComma2Patt', [2,4])\n", + " if( matched_figs is None ):\n", + " matched_figs = run_simple_matcher(fig_text, patt_hash, 'simpleComma2Patt', [1,2])\n", + " if( matched_figs is None ):\n", + " matched_figs = run_simple_matcher(fig_text, patt_hash, 'comma3Patt', [2,3,4])\n", + " if( matched_figs is None ):\n", + " matched_figs = run_simple_matcher(fig_text, patt_hash, 'simpleComma3Patt', [1,2,3])\n", + " if( matched_figs is None ):\n", + " matched_figs = run_simple_matcher(fig_text, patt_hash, 'comma4Patt', [2,3,4,5])\n", + " if( matched_figs is None ):\n", + " matched_figs = run_simple_matcher(fig_text, patt_hash, 'simpleComma4Patt', [1,2,3,4])\n", + " if( matched_figs is None ):\n", + " matched_figs = run_simple_matcher(fig_text, patt_hash, 'comma5Patt', [2,3,4,5,6])\n", + " if( matched_figs is None ):\n", + " matched_figs = run_simple_matcher(fig_text, patt_hash, 'space2Patt', [2,4])\n", + " if( matched_figs is None ):\n", + " matched_figs = run_simple_matcher(fig_text, patt_hash, 'space3Patt', [2,4,6])\n", + " if( matched_figs is None ):\n", + " matched_figs = 
run_simple_matcher(fig_text, patt_hash, 'simpleAnd2Patt', [1,2])\n", + " if( matched_figs is None ):\n", + " matched_figs = run_simple_matcher(fig_text, patt_hash, 'and2Patt', [2,3])\n", + " if( matched_figs is None ):\n", + " match = re.search(patt_hash.get('simple_a_comma_b_patt'), fig_text)\n", + " if( match ):\n", + " f = match.group(1)\n", + " a = match.group(2)\n", + " b = match.group(3)\n", + " return 'f'+f+a+'|'+'f'+f+b\n", + " if( matched_figs is None ):\n", + " match = re.search(patt_hash.get('a_comma_b_patt'), fig_text)\n", + " if( match ):\n", + " f = match.group(2)\n", + " a = match.group(3)\n", + " b = match.group(4)\n", + " return 'f'+f+a+'|'+'f'+f+b\n", + " if( matched_figs is None ):\n", + " match = re.search(patt_hash.get('simple_a_and_b_patt'), fig_text)\n", + " if( match ):\n", + " f = match.group(1)\n", + " a = match.group(2)\n", + " b = match.group(3)\n", + " return 'f'+f+a+'|'+'f'+f+b\n", + " if( matched_figs is None ):\n", + " match = re.search(patt_hash.get('a_and_b_patt'), fig_text)\n", + " if( match ):\n", + " f = match.group(2)\n", + " a = match.group(3)\n", + " b = match.group(4)\n", + " return 'f'+f+a+'|'+'f'+f+b\n", + " if( matched_figs is None ):\n", + " match = re.search(patt_hash.get('a_b_and_c_patt'), fig_text)\n", + " if( match ):\n", + " f = match.group(2)\n", + " a = match.group(3)\n", + " b = match.group(4)\n", + " c = match.group(5)\n", + " return 'f'+f+a+'|'+'f'+f+b+'|'+'f'+f+c\n", + " if( matched_figs is None ):\n", + " match = re.search(patt_hash.get('simple_a_b_and_c_patt'), fig_text)\n", + " if( match ):\n", + " f = match.group(1)\n", + " a = match.group(2)\n", + " b = match.group(3)\n", + " c = match.group(4)\n", + " return 'f'+f+a+'|'+'f'+f+b+'|'+'f'+f+c\n", + " if( matched_figs is None ):\n", + " match = re.search(patt_hash.get('simple_a_comma_b_comma_c_patt'), fig_text)\n", + " if( match ):\n", + " f = match.group(1)\n", + " a = match.group(2)\n", + " b = match.group(3)\n", + " c = match.group(4)\n", + " return 
'f'+f+a+'|'+'f'+f+b+'|'+'f'+f+c\n", + " if( matched_figs is None ):\n", + " match = re.search(patt_hash.get('a_comma_b_comma_c_patt'), fig_text)\n", + " if( match ):\n", + " f = match.group(2)\n", + " a = match.group(3)\n", + " b = match.group(4)\n", + " c = match.group(5)\n", + " return 'f'+f+a+'|'+'f'+f+b+'|'+'f'+f+c\n", + " if( matched_figs is None ):\n", + " match = re.search(patt_hash.get('intervalPatt'), fig_text)\n", + " if( match ):\n", + " fig_number = match.group(2)\n", + " start = match.group(3)\n", + " end = match.group(4)\n", + " if( len(start) > 1 or len(end)>1 ):\n", + " return None\n", + " matched_str = \"\"\n", + " subfigs = [chr(i) for i in range(ord(start),ord(end)+1)] \n", + " for subfig in subfigs :\n", + " if len(matched_str) > 0 :\n", + " matched_str += '|'\n", + " matched_str += 'f' + fig_number + subfig\n", + " return matched_str\n", + " \n", + " if(matched_figs is not None):\n", + " return build_matched_string(matched_figs, 'f')\n", + " \n", + " matched_tab = run_simple_matcher(fig_text, patt_hash, 'tablePatt', [2])\n", + " if(matched_tab is not None):\n", + " return build_matched_string(matched_tab, 't')\n", + "\n", + " matched_tab = run_simple_matcher(fig_text, patt_hash, 'suppTablePatt', [3])\n", + " if(matched_tab is not None):\n", + " return build_matched_string(matched_tab, 'st')\n", + " \n", + " return None\n", + "\n", + "def extract_simple_intact_data(input, title, tsv_output):\n", + " \n", + " with open(input, 'r') as input_file:\n", + " xml = input_file.read()\n", + " \n", + " # Check if the figure legends are specified\n", + " if \"\\\"figure legend\\\"\" not in xml: \n", + " return \n", + " \n", + " soup = BeautifulSoup(xml, 'lxml') \n", + "\n", + " intact_headings = ['pmid','i_id','orig_fig','fig','type','type_xref','p1_name',\n", + " 'p1_xref','p1_site','p2_name','p2_xref','p2_site','p3_name',\n", + " 'p3_xref','p3_site','i_meth','p_meth']\n", + " intact_rows = []\n", + "\n", + " patt_hash = 
build_figure_extraction_patterns()\n", + "\n", + " # EXPERIMENTS\n", + " all_expt_dict = {}\n", + " for e in soup.select('experimentlist experimentdescription'):\n", + " ex_dict = {}\n", + " ex_dict['i_meth'] = e.interactiondetectionmethod.names.shortlabel.text\n", + " ex_dict['p_meth'] = e.participantidentificationmethod.names.shortlabel.text \n", + " all_expt_dict[e.get('id')] = ex_dict\n", + "\n", + " # INTERACTORS\n", + " all_int_dict = {}\n", + " for i1 in soup.select('interactorlist interactor'):\n", + " int_dict = {}\n", + " int_dict['name'] = i1.names.shortlabel.text\n", + " urls = []\n", + " for t in i1.select('primaryref[db=\"uniprotkb\"]'):\n", + " if( t.get('reftype') == 'identity' ) :\n", + " urls.append(t.get('id'))\n", + " for t in i1.select('secondaryref[db=\"uniprotkb\"]'):\n", + " if( t.get('reftype') == 'identity' ) :\n", + " urls.append(t.get('id'))\n", + " int_dict['xref'] = urls\n", + " all_int_dict[i1.get('id')] = int_dict\n", + "\n", + " # INTERACTIONS\n", + " for i in soup.select('interactionlist interaction'):\n", + " int_dict = {}\n", + " int_dict['pmid'] = title\n", + " int_dict['i_id'] = i.get('id')\n", + " int_dict['type'] = i.interactiontype.names.shortlabel.text \n", + " int_dict['type_xref'] = i.interactiontype.xref.primaryref.get('id')\n", + " p_count = 1\n", + " for p_tag in i.select('participantlist participant'):\n", + " p_id = p_tag.interactorref.text\n", + " p = all_int_dict[p_id]\n", + " int_dict['p'+str(p_count)+\"_name\"] = p.get('name')\n", + " int_dict['p'+str(p_count)+\"_xref\"] = '|'.join(p.get('xref'))\n", + " p_count += 1\n", + " int_dict['fig'] = '-'\n", + " for a in i.select('attributelist attribute[name]'):\n", + " if( a.get('name') == \"figure legend\" ):\n", + " fig_text = a.text.lower()\n", + " fig_text = run_matcher(fig_text, patt_hash)\n", + " if( fig_text is None):\n", + " print(a.text.lower() + \" : None\")\n", + " int_dict['orig_fig'] = a.text\n", + " int_dict['fig'] = fig_text\n", + " e_id = 
i.experimentlist.experimentref.text\n", + " e = all_expt_dict.get(e_id)\n", + " if( e is not None ):\n", + " int_dict['i_meth'] = e.get('i_meth', '-')\n", + " int_dict['p_meth'] = e.get('p_meth', '-')\n", + " else: \n", + " int_dict['i_meth'] = '-'\n", + " int_dict['p_meth'] = '-'\n", + " \n", + " r = []\n", + " for h in intact_headings:\n", + " r.append(int_dict.get(h,'-'))\n", + " intact_rows.append(r)\n", + " \n", + " intact_df = pd.DataFrame.from_records(intact_rows, columns=intact_headings) \n", + " intact_df.to_csv(tsv_output, sep='\\t', encoding='utf-8')\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Execution of code to simplify INTACT records from standard XML into TSV format. " + ] + }, + { + "cell_type": "code", + "execution_count": 56, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/simple_intact_files/\n" + ] + } + ], + "source": [ + "stem = '/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/'\n", + "intact_dir = stem + 'gold_standard/'\n", + "simple_intact_dir = stem + 'simple_intact_files/'\n", + "\n", + "print(simple_intact_dir)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "for x in os.walk(intact_dir):\n", + " for infile in glob(os.path.join(x[0], '*.xml')):\n", + " fn = ntpath.basename(infile)\n", + " if( os.path.isfile(infile) and fn.endswith('.xml') ):\n", + " title = fn.replace(\".xml\", \"\")\n", + " if( title not in pmids ):\n", + " continue\n", + "\n", + " outfile = simple_intact_dir + \"/\" + title + \".tsv\"\n", + " if( not os.path.isfile(outfile) ):\n", + " try:\n", + " extract_simple_intact_data(infile, title, outfile)\n", + " except KeyError:\n", + " print(\"KeyError for \" + infile)" + ] + }, + { + "cell_type": 
"markdown", + "metadata": {}, + "source": [ + "Run this script to convert collections of PSI-MI2.5 files to biopax. We've updated the script to run our updated PaxTools from github.com/BMKEG/Paxtools which includes annotations about Figures in Biopax evidence codes. " + ] + }, + { + "cell_type": "code", + "execution_count": 53, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "paxtools_jar = \"/Users/Gully/Coding/git/biopax/Paxtools/paxtools-console/target/paxtools.jar\"\n", + "\n", + "data_dir = \"/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/gold_standard_data\"\n", + "open_access_dir = \"/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/oa_gold_data\"\n", + "biopax_dir = \"/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/biopax\"\n", + "new_biopax_dir = \"/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/biopax_reformat\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# THIS RUNS THE UPDATED PAXTOOLS TO GENERATE BIOPAX 3 DATA FOR OUR USE.\n", + "for root, dirs, files in os.walk(data_dir):\n", + " for file in files: \n", + " if os.path.isfile(root+'/'+file) and file[-4:]=='.xml' :\n", + " pmid = file[:-4]\n", + "\n", + " if( pmid in pmids ): \n", + " cmds = [\"java\",\"-jar\",paxtools_jar,\"toLevel3\",root+'/'+file,biopax_dir+'/'+pmid+'_biopax.xml','-psimiToComplexes']\n", + " print \" \".join(cmds)\n", + " call(cmds)\n", + " print \"\\tDONE\"" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "def reformat_figure_legend_annotations(input):\n", + " \n", + " with open(input, 'r') as input_file:\n", + " xml = input_file.read()\n", + " \n", + " if \">Figure:\" not in xml: \n", + " return \n", + "\n", + " 
patt_hash = build_figure_extraction_patterns()\n", + " fig_patt = re.compile(\">Figure:(.*?)<\")\n", + "\n", + " output = \"\"\n", + " with open(input, 'r') as input_file:\n", + " for line in input_file.readlines(): \n", + " match = re.search(fig_patt, line)\n", + " if match: \n", + " fig_text = match.group(1).lower()\n", + " new_fig_text = run_matcher(fig_text, patt_hash)\n", + " if( new_fig_text is not None ):\n", + " line = re.sub(fig_patt,\">Figure:\"+new_fig_text+\"<\",line)\n", + " #print fig_text + '==>' + new_fig_text\n", + "\n", + " output += line\n", + "\n", + " return output\n", + "\n", + "# THIS FORMATS FIGURE ANNOTATIONS IN THE UPDATED BIOPAX 3 FILES.\n", + "for root, dirs, files in os.walk(biopax_dir):\n", + " for file in files: \n", + " if os.path.isfile(root+'/'+file) and file[-4:]=='.xml' :\n", + " # Now, load each BIOPAX 3 file, and run the patterns on text found in the XML\n", + " reformatted_text = reformat_figure_legend_annotations(root+'/'+file)\n", + "\n", + " if reformatted_text is not None:\n", + " with open(new_biopax_dir+'/'+file, 'w') as output_file:\n", + " output_file.write(reformatted_text)\n", + " " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Code to find which pmids have intact records " + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from shutil import copyfile\n", + "\n", + "def copy_figure_files(intactFile, figAssigmentDir, outDir):\n", + "\n", + " frames = []\n", + " \n", + " intact_tsv = pd.read_csv(intactFile, sep='\\t')\n", + " \n", + " fries_sentences = []\n", + " fries_hits = []\n", + " fries_events = []\n", + " count = 0\n", + " fries_count = 0\n", + " hit_count = 0\n", + " miss_count = 0\n", + " for i,row in intact_tsv.iterrows():\n", + " pmid = str(row['pmid'])\n", + " fig = str(row['fig'])\n", + " src_file = figAssigmentDir+'/'+pmid+'_'+fig+'.txt'\n", + " dst_file = outDir+'/'+pmid+'_'+fig+'.txt'\n", + 
" if( os.path.isfile(figAssigmentDir + '/'+pmid+'_'+fig+'.txt') ) :\n", + " copyfile(src_file, dst_file)" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "fig_sentences_dir = stem + 'fig_sentences'\n", + "out_sentences_dir = stem + 'fig_sentences_in_intact'\n", + "\n", + "for root, dirs, files in os.walk(simple_intact_dir):\n", + " for file in files: \n", + " if os.path.isfile(root+'/'+file) and file[-4:]=='.tsv' :\n", + " copy_figure_files(root+'/'+file, fig_sentences_dir, out_sentences_dir)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Code to link the intact files to the sciDt data.\n", + "\n", + "This is derived from the simplified TSV-format generated above. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def link_scidt_to_intact(intactFile, scidtDir, outFile):\n", + "\n", + " frames = []\n", + " \n", + " intact_tsv = pd.read_csv(intactFile, sep='\\t')\n", + " \n", + " fries_sentences = []\n", + " fries_hits = []\n", + " fries_events = []\n", + " count = 0\n", + " fries_count = 0\n", + " hit_count = 0\n", + " miss_count = 0\n", + " for i,row in intact_tsv.iterrows():\n", + " pmid = row['pmid']\n", + " print(pmid)\n", + " intact_fig = row['fig']\n", + " p1 = row['p1_xref']\n", + " p2 = row['p2_xref']\n", + " p3 = row['p3_xref']\n", + "\n", + " fries_events_local = []\n", + " \n", + " # find the figure numbers in the paper designation \n", + " scidt_path = os.path.join(scidtDir, str(pmid) + \".tsv\")\n", + " if( os.path.isfile( scidt_path ) ):\n", + " scidt_tsv = pd.read_csv(scidt_path, sep='\\t')\n", + " for i2,row2 in scidt_tsv.iterrows():\n", + " fries_sentence = row2['friesSentenceId'] \n", + " fries_event = row2['friesEventsTypes'] \n", + " scidt_figs = row2['Figure Assignment']\n", + " if( scidt_figs == scidt_figs and fries_event == fries_event):\n", 
+ " for scidt_fig in scidt_figs.split('|'):\n", + " if scidt_fig == intact_fig and 'complex-assembly' in fries_event:\n", + " fries_count += 1\n", + " if( p1 != p1 or p2 != p2 or p3 != p3):\n", + " hit = \"MISS\"\n", + " miss_count += 1\n", + " elif( (p1 == '-' or p1 in fries_event) and \n", + " (p2 == '-' or p2 in fries_event) and \n", + " (p3 == '-' or p3 in fries_event) ):\n", + " hit = \"HIT\"\n", + " hit_count += 1\n", + " else :\n", + " hit = \"MISS\"\n", + " miss_count += 1\n", + " fries_events_local.append(fries_event + '[' + hit + ']')\n", + " \n", + " fries_events.append(fries_events_local)\n", + " \n", + " intact_tsv['fries_events'] = pd.Series(fries_events)\n", + " \n", + " intact_tsv.to_csv(outFile, sep='\\t')\n", + " print (\"COUNT: %d\" % fries_count)\n", + " print (\"HITS: %d\" % hit_count)\n", + " print (\"MISSES: %d\" % miss_count )" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "collapsed": true + }, + "source": [ + "Run through Biopax entries. Load each file and search for evidence. Link that evidence to sentences via figure legends." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "tsv_dir = \"/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/scidt_fries_bioc_tsv4\"\n", + "new_biopax_dir = \"/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/biopax_reformat\"" + ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1\n", + "9\n", + "1\n", + "2\n", + "1\n", + "10\n", + "5\n", + "4\n", + "5\n", + "1\n", + "9\n", + "5\n", + "18\n", + "1\n", + "6\n", + "13\n", + "9\n", + "21\n", + "215\n", + "4\n", + "5\n", + "6\n", + "5\n", + "6\n", + "2\n", + "2\n", + "3\n", + "2\n", + "9\n", + "6\n", + "5\n", + "9\n", + "4\n", + "1\n", + "3\n", + "4\n", + "1\n", + "2\n", + "15\n", + "8\n", + "14\n", + "13\n", + "6\n", + "3\n", + "1\n", + "4\n", + "4\n", + "29\n", + "11\n", + "24\n", + "14\n", + "5\n", + "2\n", + "11\n", + "7\n", + "14\n", + "2\n", + "6\n", + "5\n", + "3\n", + "2\n", + "6\n", + "5\n", + "1\n", + "5\n", + "3\n", + "1\n", + "12\n", + "5\n", + "6\n", + "1\n", + "7\n", + "13\n", + "2\n", + "6\n", + "2\n", + "6\n", + "8\n", + "4\n", + "1\n", + "1\n", + "8\n", + "10\n", + "2\n", + "1\n", + "10\n", + "7\n", + "7\n", + "1\n", + "2\n", + "13\n", + "8\n", + "8\n", + "23\n", + "8\n", + "7\n", + "9\n", + "5\n", + "89\n", + "4\n", + "2\n", + "9\n", + "6\n", + "10\n", + "4\n", + "47\n", + "4\n", + "54\n", + "14\n", + "7\n", + "1\n", + "3\n", + "17\n", + "1\n", + "5\n", + "2\n", + "4\n", + "5\n", + "19\n", + "2\n", + "14\n", + "2\n", + "2\n", + "6\n", + "4\n", + "4\n", + "3\n", + "1\n", + "3\n", + "10\n", + "2\n", + "3\n", + "6\n", + "279\n", + "6\n", + "2\n", + "2\n", + "32\n", + "8\n", + "1\n", + "2\n", + "2\n", + "5\n", + "11\n", + "4\n", + "4\n", + "2\n", + "11\n", + "2\n", + "1\n", + "8\n", + "2\n", + "4\n", + "4\n", 
+ "6\n", + "12\n", + "12\n", + "2\n", + "18\n", + "2\n", + "3\n", + "14\n", + "77\n", + "1\n", + "4\n", + "4\n", + "10\n", + "6\n", + "10\n", + "3\n", + "9\n", + "12\n", + "30\n", + "4\n", + "20\n", + "3\n", + "3\n", + "19\n", + "2\n", + "22\n", + "2\n", + "3\n", + "2\n", + "1\n", + "21\n", + "1\n", + "3\n", + "6\n", + "1\n", + "17\n", + "1\n", + "58\n", + "13\n", + "5\n", + "8\n", + "4\n", + "1\n", + "3\n", + "1\n", + "13\n", + "13\n", + "7\n", + "2\n", + "1\n", + "22\n", + "5\n", + "32\n", + "20\n", + "20\n", + "3\n", + "2\n", + "2\n", + "7\n", + "2\n", + "4\n", + "6\n", + "6\n", + "3\n", + "4\n", + "4\n", + "7\n", + "5\n", + "6\n", + "3\n", + "5\n", + "2\n", + "5\n", + "5\n", + "21\n", + "16\n", + "3\n", + "3\n", + "2\n", + "7\n", + "2\n", + "10\n", + "351\n", + "18\n", + "2\n", + "28\n", + "2\n", + "478\n", + "1\n", + "565\n", + "27\n", + "3\n", + "2\n", + "4\n", + "4\n", + "4\n", + "1\n", + "11\n", + "3\n", + "3\n", + "5\n", + "13\n", + "7\n", + "3\n", + "195\n", + "15\n", + "6\n", + "4\n", + "7\n", + "2\n", + "3\n", + "8\n", + "5\n", + "7\n", + "24\n", + "3\n", + "4\n", + "1\n", + "2\n", + "22\n", + "6\n", + "27\n", + "2\n", + "3\n", + "1\n", + "2\n", + "2\n", + "9\n", + "5\n", + "13\n", + "4\n", + "10\n", + "25\n", + "3\n", + "2\n", + "1\n", + "2\n", + "2\n", + "14\n", + "2\n", + "3\n", + "11\n", + "2\n", + "4\n", + "6\n", + "314\n", + "4\n", + "9\n", + "6\n", + "10\n", + "10\n", + "753\n", + "9\n", + "7\n", + "1\n", + "12\n", + "1\n", + "1\n", + "10\n", + "16\n", + "13\n", + "51\n", + "1710\n", + "7\n", + "2\n", + "7\n", + "10\n", + "1\n", + "30\n", + "8\n", + "20\n", + "5\n", + "11\n", + "13\n", + "7\n", + "14\n", + "3\n", + "5\n", + "9\n", + "14\n", + "3\n", + "2\n", + "3\n", + "1\n", + "3\n", + "6\n", + "1\n", + "5\n", + "129\n", + "3\n", + "8\n", + "11\n", + "7\n", + "20\n", + "12\n", + "2\n", + "3\n", + "2\n", + "4\n", + "397\n", + "26\n", + "11\n", + "4\n", + "1\n", + "3\n", + "4\n", + "11\n", + "5\n", + "20\n", + "48\n", + "60\n", + "2\n", + "2\n", + 
"2\n", + "9\n", + "2\n", + "2\n", + "2\n", + "11\n", + "2\n", + "6\n", + "4\n", + "4\n", + "7\n", + "2\n", + "275\n", + "4\n", + "10\n", + "1\n", + "1\n", + "2\n", + "11\n", + "16\n", + "410\n", + "1\n", + "200\n", + "10\n", + "2\n", + "7\n", + "59\n", + "3\n", + "4\n", + "2\n", + "4\n", + "9\n", + "1\n", + "2\n", + "8\n", + "19\n", + "1\n", + "7\n", + "10\n", + "3\n", + "1\n", + "1\n", + "2\n", + "167\n", + "5\n", + "1\n", + "6\n", + "4\n", + "6\n", + "11\n", + "4\n", + "9\n", + "4\n", + "3\n", + "8\n", + "39\n", + "2\n", + "3\n", + "2\n", + "8\n", + "9\n", + "5\n", + "7\n", + "37\n", + "1\n", + "3\n", + "2\n", + "2\n", + "11\n", + "2\n", + "12\n", + "639\n", + "1\n", + "3\n", + "3\n", + "7\n", + "4\n", + "5\n", + "1\n", + "4\n", + "9\n", + "14\n", + "8\n", + "2\n", + "26\n", + "162\n", + "4\n", + "5\n", + "3\n", + "1\n", + "3\n", + "1\n", + "16\n", + "4\n", + "219\n", + "1\n", + "2\n", + "18\n", + "5\n", + "16\n", + "12\n", + "8\n", + "3\n", + "17\n", + "14\n", + "4\n", + "2\n", + "6\n", + "14\n", + "1\n", + "3\n", + "122\n", + "10\n", + "3\n", + "8\n", + "3\n", + "2\n", + "4\n", + "5\n", + "1\n", + "1\n", + "3\n", + "19\n", + "4\n", + "15\n", + "4\n", + "6\n", + "4\n", + "10\n", + "11\n", + "96\n", + "13\n", + "7\n", + "1\n", + "9\n", + "9\n", + "11\n", + "7\n", + "10\n", + "4\n", + "17\n", + "1\n", + "4\n", + "1\n", + "3\n", + "5\n", + "35\n", + "7\n", + "6\n", + "11\n", + "30\n", + "2\n", + "51\n", + "20\n", + "2\n", + "7\n", + "10\n", + "7\n", + "9\n", + "3\n", + "2\n", + "1\n", + "4\n", + "3\n", + "1\n", + "2\n", + "12\n", + "6\n", + "2\n", + "2\n", + "818\n", + "2\n", + "6\n", + "4\n", + "1\n", + "10\n", + "3\n", + "6\n", + "42\n", + "6\n", + "3\n", + "8\n", + "2\n", + "3\n", + "39\n", + "4\n", + "4\n", + "11\n", + "6\n", + "2\n", + "1\n", + "1\n", + "3\n", + "5\n", + "4\n", + "6\n", + "3\n", + "16\n", + "159\n", + "9\n", + "62\n", + "5\n", + "1\n", + "459\n", + "4\n", + "12\n", + "13\n", + "5\n", + "2\n", + "4\n", + "1\n", + "7\n", + "8\n", + "17\n", + 
"7\n", + "3\n", + "2\n", + "2\n", + "2\n", + "8\n", + "6\n", + "4\n", + "17\n", + "16\n", + "5\n", + "6\n", + "11\n", + "12\n", + "29\n", + "97\n", + "12\n", + "17\n", + "4\n", + "3\n", + "10\n", + "12\n", + "162\n", + "2\n", + "4\n", + "6\n", + "1\n", + "283\n", + "1\n", + "2\n", + "4\n", + "1\n", + "5\n", + "10\n", + "1\n", + "31\n", + "16\n", + "3\n", + "3\n", + "74\n", + "5\n", + "3\n", + "11\n", + "2\n", + "4\n", + "4\n", + "3\n", + "14\n", + "2\n", + "34\n", + "2\n", + "4\n", + "1\n", + "2\n", + "3\n", + "14\n", + "4\n", + "2\n", + "8\n", + "6\n", + "8\n", + "46\n", + "10\n", + "2\n", + "22\n", + "15\n", + "2\n", + "2\n", + "7\n", + "6\n", + "2\n", + "3\n", + "2\n", + "7\n", + "9\n", + "6\n", + "379\n", + "6\n", + "1\n", + "21\n", + "3\n", + "40\n", + "2\n", + "20\n", + "4\n", + "22\n", + "4\n", + "3\n", + "8\n", + "7\n", + "11\n", + "7\n", + "1\n", + "10\n", + "3\n", + "28\n", + "29\n", + "4\n", + "6\n", + "1\n", + "4\n", + "25\n", + "7\n", + "8\n", + "4\n", + "2\n", + "6\n", + "3\n", + "3\n", + "1\n", + "6\n", + "2\n", + "5\n", + "13\n", + "3\n", + "1\n", + "2\n", + "1\n", + "13\n", + "6\n", + "2\n", + "6\n", + "1\n", + "17\n", + "3\n", + "7\n", + "5\n", + "22\n", + "7\n", + "8\n", + "156\n", + "6\n", + "4\n", + "4\n", + "15\n", + "7\n", + "1\n", + "7\n", + "11\n", + "4\n", + "30\n", + "2\n", + "2\n", + "2\n", + "172\n", + "7\n", + "10\n", + "18\n", + "6\n", + "16\n", + "1\n", + "2\n", + "20\n", + "3\n", + "9\n", + "1\n", + "18\n", + "884\n", + "25\n", + "4\n", + "6\n", + "6\n", + "5\n", + "3\n", + "5\n", + "2\n", + "4\n", + "2\n", + "11\n", + "1130\n", + "166\n", + "3\n", + "5\n", + "30\n", + "22\n", + "6\n", + "5\n", + "34\n", + "1\n", + "3\n", + "4\n", + "2\n", + "4\n", + "4\n", + "10\n", + "2\n", + "34\n", + "12\n", + "5\n", + "1\n", + "17\n", + "4\n", + "7\n", + "32\n", + "2\n", + "9\n", + "20\n", + "3\n", + "25\n", + "2\n", + "6\n", + "6\n", + "12\n", + "9\n", + "38\n", + "5\n", + "6\n", + "5\n", + "8\n", + "16\n", + "4\n", + "87\n", + "6\n", + 
"10\n", + "4\n", + "1\n", + "54\n", + "23\n", + "13\n", + "5\n", + "8\n", + "18\n", + "8\n", + "2\n", + "3\n", + "2\n", + "10\n", + "11\n", + "3\n", + "12\n", + "2\n", + "25\n", + "5\n", + "3\n", + "5\n", + "13\n", + "5\n", + "2\n", + "22\n", + "25\n", + "4\n", + "5\n", + "2\n", + "2\n", + "6\n", + "10\n", + "5\n", + "2\n", + "3\n", + "4\n", + "5\n", + "1\n", + "22\n", + "2\n", + "3\n", + "3\n", + "14\n", + "27\n", + "6\n", + "3\n", + "2\n", + "14\n", + "24\n", + "3\n", + "14\n", + "3\n", + "2\n", + "9\n", + "4\n", + "6\n", + "3\n", + "34\n", + "4\n", + "5\n", + "112\n", + "2\n", + "2\n", + "1\n", + "8\n", + "2\n", + "1\n", + "6\n", + "1\n", + "1\n", + "19\n", + "2\n", + "7\n", + "1\n", + "2\n", + "13\n", + "5\n", + "10\n", + "23\n", + "9\n", + "6\n", + "4\n", + "8\n", + "2\n", + "17\n", + "2\n", + "1\n", + "10\n", + "11\n", + "4\n", + "7\n", + "5\n", + "1\n", + "1\n", + "36\n", + "4\n", + "3\n", + "4\n", + "3\n", + "7\n", + "27\n", + "3\n", + "12\n", + "8\n", + "139\n", + "274\n", + "59\n", + "16\n", + "404\n", + "14\n", + "8\n", + "8\n", + "131\n", + "3\n", + "12\n", + "5\n", + "69\n", + "1\n", + "163\n", + "3\n", + "6\n", + "2\n", + "7\n", + "2\n", + "1\n", + "11\n", + "1\n", + "2\n", + "3\n", + "4\n", + "1\n", + "5\n", + "1\n", + "9\n", + "8\n", + "9\n", + "7\n", + "5\n", + "1\n", + "2\n", + "3\n", + "4\n", + "6\n", + "2\n", + "2\n", + "9\n" + ] + } + ], + "source": [ + "import uuid\n", + "import pandas as pd\n", + "import json\n", + "\n", + "def generate_annotation_page(pmid, biopax_path, scidt_path):\n", + " annotation_items = []\n", + " annotation_page = {\n", + " \"@context\": \"http://www.w3.org/ns/anno.jsonld\",\n", + " \"id\": \"http://sciknowengine.isi.edu/iswc17/annotation_page/\"+pmid,\n", + " \"type\": \"AnnotationPage\",\n", + " \"partOf\": {\n", + " \"id\": \"http://sciknowengine.isi.edu/iswc17/annotations\"\n", + " },\n", + " \"next\": \"http://example.org/page2\",\n", + " \"startIndex\": 0,\n", + " \"items\": annotation_items\n", + " }\n", + " 
\n", + " biopax_lines = []\n", + " with open(biopax_path, 'r') as biopax_file:\n", + " biopax_lines = biopax_file.readlines()\n", + "\n", + " scidt_tsv = pd.read_csv(scidt_path, sep='\\t')\n", + " \n", + " we_are_on = False\n", + " evidence_patt = re.compile(\"\")\n", + " figure_patt = re.compile(\">Figure:(.*?)<\")\n", + " evidence_off_patt = re.compile(\"<\\/bp:Evidence>\")\n", + "\n", + " evidence_code = ''\n", + " figure_code = ''\n", + " for biopax_line in biopax_lines: \n", + " evidence_match = re.search(evidence_patt, biopax_line)\n", + " if evidence_match: \n", + " evidence_code = evidence_match.group(1)\n", + " figure_code = ''\n", + " we_are_on = True\n", + "\n", + " figure_match = re.search(figure_patt, biopax_line)\n", + " if figure_match: \n", + " figure_code = figure_match.group(1)\n", + " \n", + " if we_are_on and len(figure_code)>0:\n", + " \n", + " targets = []\n", + " annotation = {\n", + " \"id\": \"http://sciknowengine.isi.edu/iswc17/annotations/\"+pmid+'#'+str(count),\n", + " \"type\": \"Annotation\",\n", + " \"body\": {\n", + " \"id\": evidence_code,\n", + " \"type\": \"Dataset\"\n", + " },\n", + " \"target\": targets\n", + " }\n", + " annotation_items.append(annotation)\n", + " \n", + " for i, row in scidt_tsv.iterrows():\n", + " sid = row['SentenceId']\n", + " text = row['Sentence Text']\n", + " codeStr = row['Codes']\n", + " expts = row['ExperimentValues']\n", + " paragraph = row['Paragraph']\n", + " heading = row['Headings']\n", + " discourse = row['Discourse Type']\n", + " offset_start = row['Offset_Begin']\n", + " offset_end = row['Offset_End']\n", + " fig = row['Figure Assignment']\n", + " \n", + " if(fig != fig):\n", + " continue\n", + " \n", + " for f in re.split(\"|\", fig):\n", + " if( f in figure_code):\n", + " \n", + " targets.append({\n", + " \"source\": \"https://www.ncbi.nlm.nih.gov/pmc/articles/pmid/\" + str(pmid),\n", + " \"selector\": [{\n", + " \"type\": \"TextQuoteSelector\",\n", + " \"exact\": text\n", + " },\n", + " 
{\n", + " \"type\": \"TextPositionSelector\",\n", + " \"start\": offset_start,\n", + " \"end\": offset_end\n", + " }]\n", + " })\n", + " \n", + " annotation['target'] = targets\n", + " we_are_on = False\n", + " \n", + " #print len(annotation_items)\n", + " annotation_page['items'] = annotation_items\n", + " \n", + " return annotation_page\n", + "\n", + "annotation_collection_path = \"/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/annotation_collection.json\"\n", + "annotation_pages_path = \"/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/pages\"\n", + "\n", + "page = {}\n", + "annotation_collection = {\n", + " \"@context\": \"http://www.w3.org/ns/anno.jsonld\",\n", + " \"id\": \"http://sciknowengine.isi.edu/iswc17/annotations\",\n", + " \"type\": \"AnnotationCollection\",\n", + " \"label\": \"Anntoations linking BioPax records from the INTACT database to text fragments describing evidence\",\n", + " \"total\": 0,\n", + " \"first\": page\n", + "}\n", + "\n", + "count = 0\n", + "annotation_pages = {}\n", + "last_annotation_page = None\n", + "for root, dirs, files in os.walk(new_biopax_dir):\n", + " for file in files: \n", + " if os.path.isfile(root+'/'+file) and file[-4:]=='.xml' :\n", + " l = len('_biopax.xml')\n", + " pmid = file[:-l]\n", + " tsv_file = tsv_dir+'/'+str(pmid)+'.tsv'\n", + " \n", + " if not os.path.isfile(tsv_file):\n", + " continue\n", + " \n", + " annotation_page = generate_annotation_page(pmid, root+'/'+file, tsv_dir+'/'+str(pmid)+'.tsv')\n", + " count += 1\n", + " #print json.dumps(annotation_page, sort_keys=True, indent=4, separators=(',', ': '))\n", + " \n", + " if(last_annotation_page is None):\n", + " annotation_collection['first'] = annotation_page['id']\n", + " else:\n", + " last_annotation_page['next'] = annotation_page['id']\n", + " \n", + " annotation_page_dump = json.dumps(annotation_page, sort_keys=True, indent=4, separators=(',', ': '))\n", + " with 
open(annotation_pages_path+'/page_'+pmid+'.json', 'w') as annotation_page_file:\n", + " annotation_page_file.write(annotation_page_dump)\n", + " \n", + " last_annotation_page = annotation_page\n", + " \n", + "annotation_collection['total'] = count\n", + "annotation_collection_dump = json.dumps(annotation_collection, sort_keys=True, indent=4, separators=(',', ': '))\n", + "with open(annotation_collection_path, 'w') as annotation_collection_file:\n", + " annotation_collection_file.write(annotation_collection_dump)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "annotation_collection_path = \"/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/annotation_collection.json\"\n", + "annotation_pages_path = \"/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/pages\"\n", + "\n", + "annotation_collection_dump = json.dumps(annotation_collection, sort_keys=True, indent=4, separators=(',', ': '))\n", + "with open(annotation_collection_path, 'w') as annotation_collection_file:\n", + " annotation_collection_file.write(annotation_collection_dump)\n", + "\n", + "for pmid in annotation_pages.keys():\n", + " page = annotation_pages[pmid]\n", + " annotation_page_dump = json.dumps(page, sort_keys=True, indent=4, separators=(',', ': '))\n", + " with open(annotation_pages_path+'/page_'+pmid+'.json', 'w') as annotation_page_file:\n", + " annotation_page_file.write(annotation_page_dump)\n", + " \n", + "print json.dumps(annotation_collection, sort_keys=True, indent=4, separators=(',', ': '))\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + 
"version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.13" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/notebooks/word-movers-distance-in-python.ipynb b/notebooks/word-movers-distance-in-python.ipynb new file mode 100644 index 0000000..50a2b06 --- /dev/null +++ b/notebooks/word-movers-distance-in-python.ipynb @@ -0,0 +1,1358 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Word mover's distance classification in Python\n", + "\n", + "*A guide to scikit-learn compatible nearest neighbors classification using the recently introduced word mover's distance (WMD).*\n", + "Joint post with the awesome [Matt Kusner](http://matthewkusner.com)!\n", + "\n", + "[Source of this Jupyter notebook.](http://nbviewer.jupyter.org/github/vene/vene.github.io/blob/pelican/content/blog/word-movers-distance-in-python.ipynb)\n", + "\n", + "In document classification and other natural language processing applications, having a good measure of the similarity of two texts can be a valuable building block. Ideally, such a measure would capture semantic information. Cosine similarity on bag-of-words vectors is known to do well in practice, but it inherently cannot capture when documents say the same thing in completely different words.\n", + "\n", + "Take, for example, two headlines:\n", + "\n", + " * *Obama speaks to the media in Illinois*\n", + " * *The President greets the press in Chicago*\n", + "\n", + "These have no content words in common, so according to most bag-of-words-based metrics, their distance would be maximal. 
(For such applications, you probably don't want to count stopwords such as *the* and *in*, which don't truly signal semantic similarity.)\n", + "\n", + "One way out of this conundrum is the word mover's distance (WMD), introduced in \n", + "[*From Word Embeddings To Document Distances*](http://mkusner.github.io/publications/WMD.pdf),\n", + "(Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. Weinberger, ICML 2015).\n", + "WMD adapts the [earth mover's distance](https://en.wikipedia.org/wiki/Earth_mover%27s_distance) to the space of documents: the distance between two texts is given by the total amount of \"mass\" needed to move the words from one side into the other, multiplied by the distance the words need to move. So, starting from a measure of the distance between different words, we can get a principled document-level distance. Here is a visualisation of the idea, from the ICML slides:\n", + "\n", + "![WMD example from Matt's slides](https://vene.ro/images/wmd-obama.png)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare some word embeddings\n", + "\n", + "The key ingredient in WMD is a good distance measure between words. Dense representations of words, also known by the trendier name \"word embeddings\" (because \"distributed word representations\" didn't stick), do the trick here. We could train the embeddings ourselves, but for meaningful results we would need tons of documents, and that might take a while. So let's just use the ones from the [`word2vec`](https://code.google.com/p/word2vec/) team. 
[(download link)](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit?usp=sharing)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import os\n", + "\n", + "import numpy as np\n", + "import gensim\n", + "\n", + "from sklearn.datasets import fetch_20newsgroups\n", + "from sklearn.feature_extraction.text import CountVectorizer\n", + "from sklearn.cross_validation import train_test_split\n" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": { + "collapsed": false, + "scrolled": true + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "s6 (,hypothesis|implication) [X] : In support of distinct peroxisomal binding sites for Pex7p , the Pex7p/Pex13p and Pex7p/ Pex14p complexes can form independently. Genetic evidence for the interaction of Pex7p and Pex13p is provided by the observation that overexpression of Pex13p suppresses a loss of function mutant of Pex7p .\n", + "s8 (,implication|implication) [0] : NH2-terminal regions of Pex13p are required for its interaction with the PTS2-receptor while the COOH-terminal SH3 domain alone is sufficient to mediate its interaction with the PTS1-receptor .\n", + "s9 (,result|implication) [0] : Reinvestigation of the topology revealed both termini of Pex13p to be oriented towards the cytosol .\n", + "s10 (,result|implication) [0] : We also found Pex13p to be required for peroxisomal association of Pex14p , yet the SH3 domain of Pex13p may not provide the only binding site for Pex14p at the peroxisomal membrane .\n", + "s18 (,result|result) [0] : pex7Delta cells exhibit the reverse phenotype ( for review see exLink ) .\n", + "s19 (,implication) [0] : The intracellular localization of both targeting signal receptors is still a matter of debate .\n", + "s20 (,result|result) [0] : A predominantly cytosolic , membrane-bound , and even intraperoxisomal localization have been 
reported for both receptors ( for review see exLink ) .\n", + "s23 (,result|implication) [0] : There is no experimental evidence for this model , but it is consistent with the observation that peroxisomes are able to import both folded and oligomeric proteins ( for review see exLink ) .\n", + "s30 (,implication|implication) [0] : Together , these data suggest that the two import pathways are not independent but overlapping , with Pex14p as the point of convergence of the pathways at the peroxisomal membrane ( exLink ) .\n", + "s33 (,result|method) [X] : In addition , overexpression of Pex13p suppresses the protein import defect caused by HA-tagged , functionally compromised Pex7p , further suggesting an interaction between the two proteins by genetic means .\n", + "s35 (,result|result) [0] : Reinvestigation of the membrane topology of Pex13p revealed that both termini of the protein are exposed to the cytosol .\n", + "s36 (,implication) [0] : Pex13p was also required for Pex14p localization at the peroxisomal membrane .\n", + "s37 (,result) [0] : However , the peroxisomal targeting of Pex14p did not require interaction with the SH3 domain of Pex13p .\n", + "s133 (Results,result|hypothesis) [0] : It has been reported that the import receptors Pex5p and Pex7p interact with each other in the yeast two-hybrid system , which opened the possibility that both proteins may form a heteromeric cytosolic signal recognition complex ( exLink ) .\n", + "s134 (Results,result|hypothesis) [0] : However , the yeast two-hybrid system does not necessarily distinguish between a direct and indirect binding of two S. 
cerevisiae proteins , as endogenous proteins may contribute to the observed interaction .\n", + "s135 (Results,hypothesis|result) [X] : As Pex14p can bind both import receptors , we investigated whether the Pex5p/Pex7p interaction is still observed in a yeast two-hybrid reporter strain deleted for the genomic copy of PEX14 ( exLink ) .\n", + "s143 (Results,method|result) [0] : Pex14p and Pex13p , but not Fbp1p , pelleted , indicating the complete sedimentation of cytosol-free peroxisomal membranes ( Fig. 2 ) .\n", + "s144 (Results,result|result) [0] : As reported previously ( exLink ) , mycPex7p was predominantly found in the soluble fraction in wild-type cells , while a low but significant amount was detected in the membrane fraction .\n", + "s145 (Results,result|result) [0] : A decrease of mycPex7p in the pellet fraction of pex14Delta cells ( Fig. 2 ) suggests that the majority of sedimentable Pex7p associates with membranes in a Pex14p-dependent manner .\n", + "s146 (Results,result|implication) [X] : However , in pex14Delta cells a significant amount of mycPex7p was detected in the membrane pellet fraction ( Fig. 2 ) , indicating that next to Pex14p additional binding factors for Pex7p exist at the peroxisomal membrane .\n", + "s147 (Results,result) [0] : Coimmunoprecipitation of Pex13p and Pex7p in the Absence of Pex14p and Pex5p\n", + "s149 (Results,result|result) [0] : As reported previously , we found Pex5p , Pex13p , Pex14p , and Pex17p associated with mycPex7p when the receptor was precipitated from wild-type or complemented pex7Delta cells ( exLink ; exLink ) .\n", + "s150 (Results,result) [0] : Comparison of the constituents of the precipitates revealed five interesting observations .\n", + "s151 (Results,result|implication) [0] : First , in pex14Delta and pex5Delta/pex14Delta strains , Pex13p still coimmunoprecipitated with mycPex7p ( Fig. 
3 ) , suggesting that Pex13p associates directly or indirectly with Pex7p .\n", + "s152 (Results,result|implication) [0] : Moreover , this result indicated that neither Pex14p nor Pex5p is required for the formation of this subcomplex of Pex13p and Pex7p .\n", + "s153 (Results,result|result) [X] : Second , the amount of Pex5p in the precipitate from pex14Delta cells was drastically reduced , while the amount in Pex13p remained essentially unchanged ( Fig. 3 , lane pex14Delta ) .\n", + "s154 (Results,implication|implication) [X] : This result supports the notion that the amount of Pex5p bound to Pex13p does not determine the stoichiometry of the Pex13p-Pex7p subcomplex .\n", + "s155 (Results,implication|implication) [0] : However , it also suggests that Pex13p may not bind both import receptors equally at the same time .\n", + "s156 (Results,result) [0] : Third , Pex13p , Pex14p , and Pex5p still coimmunoprecipitated with Pex7p in pex17Delta cells ( Fig. 3 , lane pex17Delta ) .\n", + "s157 (Results,fact|implication) [0] : Obviously , Pex7p is associated with components of the peroxisomal translocation machinery in the absence of Pex17p , suggesting that the presence of Pex17p is not a prerequisite for docking of Pex7p to the peroxisomal membrane .\n", + "s158 (Results,result|implication) [0] : Fourth , the lack of Pex17p in the coimmunoprecipitate from pex14Delta cells ( Fig. 
3 , lane pex14Delta ) , suggests that Pex14p is required for the association of Pex17p with the complex , and is consistent with the assumption that Pex17p binding to the complex may be via Pex14p .\n", + "s159 (Results,implication|result) [0] : However , this observation must be interpreted with care since the pex14Delta cells contain much less immunologically detectable Pex17p ( exLink ) .\n", + "s160 (Results,result|result) [X] : Finally , the amount of Fox3p that coimmunoprecipitates with Pex7p drastically increases in mutants with an import defect for PTS2 proteins ( pex17Delta , pex13Delta , pex14Delta , and pex5Delta/pex14Delta ) relative to the strains unaffected in this pathway ( wild-type and pex5Delta ) .\n", + "s161 (Results,result|result) [X] : Since the total amount of both proteins is similar in all strains ( Fig. 3 B ) , it seems unlikely that the observed Pex7p/Fox3p complex has formed in vitro after cell disruption .\n", + "s162 (Results,hypothesis|implication) [0] : A simple explanation for this may be that the high cytosolic concentration of thiolase in the import mutants results in greater occupation of the PTS2 receptor .\n", + "s164 (Results,result|result) [0] : These proteins were not detected in any of the samples , indicating the specificity of the observed interactions ( data not shown ) .\n", + "s166 (Results,result|goal) [0] : The observed in vivo association of Pex7p with Pex13p in cells lacking Pex14p and Pex5p encouraged us to analyze the interaction of these proteins in more detail .\n", + "s169 (Results,result|result) [0] : The results shown in Fig. 
4 A reveal that the full length Pex13p is indeed able to interact with the PTS2-receptor Pex7p .\n", + "s170 (Results,result|result) [0] : The controls included show that coexpression of either of the fusion proteins alone did not support transcription activation of the reporter genes .\n", + "s173 (Results,result|result) [X] : Because Pex13pE320K lost the ability to interact with Pex14p in the yeast two-hybrid system ( Fig. 4 B , see also Fig. 8 ) , this experiment was expected to monitor the Pex13p/Pex7p interaction upon simultaneous elimination of the Pex14p and Pex5p influence .\n", + "s174 (Results,result|result) [0] : As shown in Fig. 4 , these two-hybrid analyses did not reveal an influence of Pex5p or Pex14p on the Pex13p/ Pex7p interaction .\n", + "s175 (Results,result|result) [X] : No difference was observed independent of whether the Pex7p/Pex13p interaction was analyzed in wild-type , pex5Delta , or pex14Delta strains ( Fig. 4 A ) , or for the Pex7p/Pex13pE320K interaction in pex5Delta cells ( Fig. 4 B ) .\n", + "s176 (Results,implication|implication) [0] : These results indicate that neither Pex14p nor Pex5p is required for the in vivo interaction of Pex7p with Pex13p , and therefore are in agreement with results obtained in the coimmunoprecipitation experiment ( Fig. 3 ) .\n", + "s177 (Results,result|implication) [0] : The two-hybrid interaction of the complete Pex13p with Pex14p is only detected by histidine prototrophy ( Fig. 4 B ) , indicating that regions NH2-terminal of the SH3 domain of Pex13p may weaken the interaction of these proteins in the two-hybrid system .\n", + "s179 (Results,fact|result) [0] : Mutant cells lacking Pex7p are characterized by their inability to grow on oleic acid as the sole carbon source ( Fig. 
5 A ) and by mislocalization of peroxisomal thiolase to the cytosol ( exLink ; exLink ) .\n", + "s180 (Results,result) [X] : Expression of a COOH-terminally HA-tagged Pex7p from the low copy plasmid pRSPEX7-HA3 leads only to a partial complementation of the pex7Delta mutant phenotype ( exLink ) .\n", + "s181 (Results,implication|result) [0] : This is indicated by the inability of the transformants to grow on oleic acid plates ( Fig. 5 A ) and a reduced ability to import Fox3p ( thiolase ) into peroxisomes .\n", + "s182 (Results,result) [0] : The latter is evident by the pronounced cytosolic mislocalization of this protein ( Fig. 5 B , panel d ) .\n", + "s183 (Results,result|goal) [X] : This mutant phenotype of pex7Delta [ pRSPEX7-HA3 ] was employed to investigate whether overexpression of Pex7p-binding partners may suppress a defect in Pex7p function .\n", + "s185 (Results,result|result) [X] : As judged by their growth characteristics on oleic acid medium ( Fig. 5 A ) and by the fluorescence pattern for thiolase ( Fig. 5 B ) , overexpression of PEX13 , but not PEX14 , rescued the mutant phenotype caused by the defective Pex7p-HA .\n", + "s186 (Results,result|implication) [X] : Even though the suppression was not as efficient as complementation with the wild-type PEX7 , this observation demonstrates that Pex13p can suppress the mutant phenotype of pex7Delta [ pRSPEX7-HA3 ] , providing genetic evidence for an interaction between Pex7p and Pex13p .\n", + "s191 (Results,result|result) [0] : The tag has been shown previously not to affect the function of Pex13p ( exLink ) .\n", + "s193 (Results,result|result) [X] : As judged by immunoblot analysis , both the NH2-terminal myc-tag as well as the SH3 domain of Pex13p were rapidly degraded by the protease ( Fig. 
6 ) .\n", + "s194 (Results,result) [0] : Intraperoxisomal thiolase remained stable under these conditions and was only degraded in the presence of detergents ( data not shown ) .\n", + "s195 (Results,implication|implication) [0] : From this data , we conclude that both the NH2 terminus and the COOH-terminal SH3 domain are exposed to the cytosol .\n", + "s196 (Results,implication) [0] : This result also implicates the presence of an even number of transmembrane spans within Pex13p .\n", + "s203 (Results,method|implication) [0] : This observation suggests that Pex17p is not required for the targeting of Pex14p to the peroxisomal membrane .\n", + "s204 (Results,implication) [0] : In contrast , no congruent fluorescence patterns were observed in pex13Delta cells .\n", + "s205 (Results,result|fact) [0] : Since the HA-tagged Pex11p is known to be targeted to peroxisomal membrane ghosts in pex13Delta cells ( exLink ) , the lack of congruence suggests that the majority of Pex14p is mislocalized .\n", + "s206 (Results,implication|goal) [0] : To confirm this result by independent means , we performed a flotation of wild-type , pex13Delta , and pex17Delta homogenates in sucrose gradients ( Fig. 
7 B ) .\n", + "s209 (Results,fact|result) [0] : However , Pex14p was not detected in these fractions , but was found to cosegregate with mitochondrial fumarase .\n", + "s210 (Results,fact|implication) [0] : These data suggest that the peroxisomal membrane ghosts in pex13Delta cells lack Pex14p .\n", + "s211 (Results,implication) [0] : Thus , the presence of Pex13p is a prerequisite for peroxisomal membrane association of Pex14p .\n", + "s212 (Results,implication|hypothesis) [0] : Pex13p could be involved in targeting , or it could be required for binding or retention of Pex14p at the peroxisome .\n", + "s217 (Results,result) [0] : Remarkably , the mutated Pex14pAXXA still complemented the peroxisome biogenesis defect of pex14Delta cells ( data not shown ) .\n", + "s219 (Results,result|result) [0] : This mutation has been reported to result in the inactivation of Pex13p function ( exLink ) .\n", + "s220 (Results,result|result) [0] : As shown in Fig. 8 , the mutated Pex14pAXXA had lost the ability to bind Pex13p in the yeast two-hybrid system while binding to Pex5p , Pex7p , and oligomerization of the protein was unchanged .\n", + "s221 (Results,result) [0] : Also the E320K mutation of Pex13p abolished the two-hybrid interaction of the SH3 domain of Pex13p with Pex14p ( Fig. 8 ) .\n", + "s222 (Results,implication|implication) [0] : These results suggest that strong interactions between Pex14p and the SH3 domain of Pex13p are dependent on the PXXP motif within Pex14p , as well as on the RT loop of the SH3 domain of Pex13p .\n", + "s223 (Results,method|result) [0] : Next , we analyzed the Pex14pAXXA ( Fig. 
9 A ) association with peroxisomal membrane ghosts of pex14Delta/pex17Delta double mutants which were predicted to contain peroxisomal membrane ghosts even upon complementation of the pex14Delta mutation .\n", + "s226 (Results,result) [0] : Colocalization was observed for HA-Pex11p and Pex14pAXXA in pex14Delta cells , as well as for HA-Pex11p and Pex14p in pex13Delta cells expressing Pex13pE320K , indicative of peroxisomal membrane association of these proteins ( Fig. 9 A ) .\n", + "s227 (Results,result|result) [0] : These results were corroborated by flotation analysis which revealed that Pex14pAXXA was associated with the fraction containing the peroxisomal membrane ghosts of pex14Delta/pex17Delta , as were Pex14p in pex13Delta/pex17Delta cells expressing Pex13pE320K ( Fig. 9 B ) .\n", + "s228 (Results,implication|implication) [0] : These observations suggest that Pex14p is associated with peroxisomes and peroxisomal membrane ghosts independent of interaction between the proline-rich motif of Pex14p and the RT loop in the SH3 domain of Pex13p .\n", + "s229 (Results,result|implication) [0] : Interestingly , the fractionation of pex13Delta/ pex17Delta [ PEX13E320K ] shows that , although the RT loop of the SH3 domain of Pex13p is not absolutely required for the targeting of Pex14p to the membrane of peroxisomal ghosts , it appears to enhance or stabilize the targeting , as only Pex14p trails through the gradients of this mutant strain ( Fig. 
9 B ) .\n", + "s230 (Discussion,implication) [0] : Discussion\n", + "s231 (Discussion,implication|implication) [0] : The peroxisomal membrane protein Pex14p has been reported to bind both the PTS1 and the PTS2 receptor , which led exLink to the conclusion that Pex14p may represent the point of convergence of the PTS1 - and PTS2-dependent protein import pathways at the peroxisomal membrane .\n", + "s234 (Discussion,implication|implication) [0] : Pex13p is also shown to be required for the peroxisomal association of Pex14p ; however , evidence is provided that the SH3 domain of Pex13p may not represent the only binding site for Pex14p at the peroxisomal membrane .\n", + "s236 (Discussion,result|fact) [0] : The SH3 domain of Pex13p has been reported to interact with the PTS1 receptor Pex5p and with Pex14p ( exLink ; exLink ; exLink ; exLink ; exLink ; Fig. 8 ) .\n", + "s237 (Discussion,fact|result) [X] : A mutation in the RT loop of the SH3 domain of Pex13p , as well as a mutation of a putative class II SH3 ligand motif of Pex14p abolished the two-hybrid interaction of both proteins ( Fig. 8 ) , supporting the notion of a typical SH3 domain-ligand interaction between Pex13p and Pex14p .\n", + "s238 (Discussion,result|result) [0] : Interestingly , although the E320K mutation of the RT loop of the SH3 domain of Pex13p abolishes its two-hybrid interaction with Pex14p , the mutated SH3 domain still interacts with Pex5p ( Fig. 8 B ) .\n", + "s239 (Discussion,implication|implication) [0] : Accordingly , we conclude that there are distinct binding sites for both Pex5p and Pex14p within this domain or adjacent regions contained within the construct used for the assay .\n", + "s240 (Discussion,result) [X] : Remarkably , neither the E320K mutation of the SH3 domain of Pex13p nor the mutation of the proline-rich motif of Pex14p prevented the peroxisomal localization of Pex14p ( Fig. 
9 ) .\n", + "s241 (Discussion,implication|implication) [0] : This observation suggests that the binding of Pex14p to the SH3 domain of Pex13p is not absolutely required for the targeting and binding of Pex14p to peroxisomes .\n", + "s242 (Discussion,result) [0] : Why then does the absence of Pex13p lead to the mistargeting of Pex14p ( Fig. 7 ) ?\n", + "s247 (Discussion,hypothesis|implication) [0] : It is true that Pex17p is another binding partner of Pex14p , but our data suggest that Pex17p is not required for association of the Pex13p/ Pex14p/Pex5p/Pex7p complex , as all these components can efficiently coprecipitate in the absence of Pex17p ( Fig. 3 ) .\n", + "s248 (Discussion,result|result) [0] : Moreover , we found no Pex17p in a precipitate from pex14Delta cells that still contains Pex13p and Pex7p ( Fig. 3 ) , leading to two conclusions .\n", + "s252 (Discussion,result|implication) [X] : The amount of Pex7p in the membrane sediment of pex14Delta cells is significantly lower than in wild-type or pex13Delta cells ( Fig. 2 ) , suggesting that Pex14p may contribute to the majority of the total binding capacity of the peroxisomal membrane for the PTS2 receptor .\n", + "s253 (Discussion,result) [X] : However , a significant amount of Pex7p was sedimented in the absence of Pex14p ( Fig. 2 , lane pex14Delta ) .\n", + "s254 (Discussion,result|implication) [X] : Interestingly , in cells lacking both Pex13p and Pex14p , no Pex7p was found in the membrane pellet , which suggests that Pex13p contributed to the remaining Pex7p associated with peroxisomal membranes of pex14Delta cells ( data not shown ) .\n", + "s255 (Discussion,implication|result) [0] : This result , however , has to be interpreted with care since the double deletion of PEX13 and PEX14 did result in a significant decrease in immunologically detectable Pex7p ( Girzalsky , W. , and R. 
Erdmann , unpublished observations ) .\n", + "s256 (Discussion,result|result) [0] : The observations that Pex13p and Pex7p interact in the two-hybrid system and can be efficiently coimmunoprecipitated indicate that the proteins interact in vivo ( Figs. 3 and 4 ) .\n", + "s257 (Discussion,implication|result) [X] : Whether Pex13p directly binds Pex7p remains to be shown .\n", + "s258 (Discussion,method|result) [0] : Attempts to demonstrate direct binding of the proteins by coimmunoprecipitation of in vitro translated proteins were unsuccessful ( data not shown ) .\n", + "s260 (Discussion,implication|implication) [0] : However , two observations indicate that the hypothetical bridging protein is not one of the known binding partners for Pex13p .\n", + "s261 (Discussion,result|result) [X] : First , the Pex7p/Pex13p interaction is also observed in the absence of these proteins ( Figs. 3 and 4 ) , and second , the COOH-terminal SH3 domain alone is sufficient for the Pex13p/ Pex14p and Pex13p/Pex5p two-hybrid interaction , but not for the interaction of Pex13p with Pex7p ( exLink ) .\n", + "s262 (Discussion,implication) [X] : A direct interaction of Pex13p and Pex7p is further suggested by the genetic suppression of the defect caused by a functionally compromised HA-tagged Pex7p by overexpression of Pex13p ( Fig. 5 ) .\n", + "s263 (Discussion,implication|result) [0] : As discussed above , a Pex5p/Pex7p two-hybrid interaction is not observed in pex14Delta ( Fig. 1 ) .\n", + "s264 (Discussion,implication|result) [X] : At first , this observation seems rather surprising , since both Pex5p and Pex7p independently interact with Pex13p in the two-hybrid system ( Fig. 4 ) .\n", + "s267 (Discussion,implication) [X] : In support of this assumption , the amount of Pex5p coimmunoprecipitating with Pex7p in the absence of Pex14p is extremely reduced , despite the presence of significant amounts of Pex13p ( Fig. 
3 , lane pex14Delta ) .\n", + "s268 (Discussion,implication|result) [0] : Perhaps Pex13p does not usually associate simultaneously with both of the import receptors , or association is transient .\n", + "s271 (Discussion,result|result) [0] : One group has reported that the protein is exclusively localized in the peroxisomal lumen ( exLink , exLink ) , whereas others found the protein to be predominantly localized in the cytosol with a small amount associated with the peroxisomal membrane ( exLink ; exLink ) .\n", + "s272 (Discussion,implication|hypothesis) [0] : Because the SH3 domain alone does not mediate the interaction with Pex7p , we suggest that regions NH2-terminal of the SH3 domain may be required for the interaction or contribute to the correct conformation of the binding site .\n", + "s273 (Discussion,result|result) [0] : Previously , the COOH-terminal SH3 domain has been reported to face the cytosol ( exLink ; exLink ) , and we found that both the NH2 terminus and the COOH terminus of Pex13p are exposed to the cytosol ( Fig. 
6 ) , suggesting that the protein traverses the membrane with an even number of membrane spans .\n", + "s274 (Discussion,implication|hypothesis) [0] : In this respect , it is interesting to note that two regions which would fulfill the requirement for alpha-helical transmembrane segments are present in Pex13p ( exLink ) .\n", + "s275 (Discussion,fact|implication) [0] : The interaction of Pex13p with Pex7p has far reaching implications for our understanding of protein import into the peroxisomal matrix .\n", + "\n", + "105/279\n", + "25/39\n" + ] + } + ], + "source": [ + "import pandas as pd\n", + "from nltk.corpus import stopwords\n", + "import re\n", + "\n", + "inFile = '/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/intact/scidt_fries_bioc_tsv4/10087260.tsv'\n", + "#/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/corpora/pathwayLogic/scidt_bioc_sentences_tsv/11777939.tsv'\n", + "tsv = pd.read_csv(inFile, sep='\\t')\n", + "sentences = []\n", + "\n", + "stopwords = stopwords.words('english')\n", + "regex1 = re.compile(r\"[\\(\\)\\{\\}\\[\\]\\;\\.\\'\\\"\\,\\/\\_\\*]\", re.IGNORECASE)\n", + "regex2 = re.compile(r\"\\s+\", re.IGNORECASE)\n", + "\n", + "allHits = 0\n", + "hits = 0\n", + "j = 0\n", + "for i, row in tsv.iterrows():\n", + " sid = row['SentenceId']\n", + " codeStr = row['Codes']\n", + " paragraph = row['Paragraph']\n", + " text = row['Sentence Text']\n", + " heading = row['Headings']\n", + " floatingBox = row['FloatingBox?']\n", + " discourse = row['Discourse Type']\n", + " reachData = row['friesEventsTypes']\n", + " \n", + " j += 1\n", + " if(reachData == reachData):\n", + " allHits += 1\n", + "\n", + " if (heading != heading):\n", + " heading = \"\"\n", + "\n", + " if (floatingBox):\n", + " continue\n", + "\n", + " if (('implication' not in discourse) and\n", + " 'result' not in discourse):\n", + " continue\n", + "\n", + " if ('methods' in heading.lower()):\n", + " continue\n", + " \n", + " 
r = 'X'\n", + " if(reachData != reachData):\n", + " r = '0'\n", + " \n", + " if(reachData == reachData):\n", + " hits += 1\n", + "\n", + " print(sid + ' (' + heading + ',' + discourse + ') ' + '[' + r + '] : ' + text ) \n", + " \n", + " text = re.sub(regex1,\"\",text)\n", + " sent = regex2.split(text)\n", + " sent = [w for w in sent if w not in stopwords and len(w)>0]\n", + " sentences.append(sent)\n", + "\n", + " if 'exLink' in codeStr:\n", + " continue\n", + "\n", + " \n", + "print\n", + "print (str(len(sentences)) + '/' + str(j))\n", + "print (str(hits) + '/' + str(allHits))" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "wv = gensim.models.KeyedVectors.load_word2vec_format(\n", + " \"/Users/Gully/Documents/Projects/2_active/bigMech/work/2017-01-30-ldk_paper/embeddings_pubmed_files/PMC-w2v.bin\",\n", + " binary=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "from gensim.models.word2vec import Word2Vec\n", + "\n", + "model = Word2Vec(iter=1) \n", + "model.wv = wv" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Number of sentences = 105\n", + "\n", + "........................................................................................................\n", + ".......................................................................................................\n", + "......................................................................................................\n", + ".....................................................................................................\n", + "....................................................................................................\n", + 
"...................................................................................................\n", + "..................................................................................................\n", + ".................................................................................................\n", + "................................................................................................\n", + "...............................................................................................\n", + "..............................................................................................\n", + ".............................................................................................\n", + "............................................................................................\n", + "...........................................................................................\n", + "..........................................................................................\n", + ".........................................................................................\n", + "........................................................................................\n", + ".......................................................................................\n", + "......................................................................................\n", + ".....................................................................................\n", + "....................................................................................\n", + "...................................................................................\n", + "..................................................................................\n", + ".................................................................................\n", + "................................................................................\n", + 
"...............................................................................\n", + "..............................................................................\n", + ".............................................................................\n", + "............................................................................\n", + "...........................................................................\n", + "..........................................................................\n", + ".........................................................................\n", + "........................................................................\n", + ".......................................................................\n", + "......................................................................\n", + ".....................................................................\n", + "....................................................................\n", + "...................................................................\n", + "..................................................................\n", + ".................................................................\n", + "................................................................\n", + "...............................................................\n", + "..............................................................\n", + ".............................................................\n", + "............................................................\n", + "...........................................................\n", + "..........................................................\n", + ".........................................................\n", + "........................................................\n", + ".......................................................\n", + "......................................................\n", + ".....................................................\n", + 
"....................................................\n", + "...................................................\n", + "..................................................\n", + ".................................................\n", + "................................................\n", + "...............................................\n", + "..............................................\n", + ".............................................\n", + "............................................\n", + "...........................................\n", + "..........................................\n", + ".........................................\n", + "........................................\n", + ".......................................\n", + "......................................\n", + ".....................................\n", + "....................................\n", + "...................................\n", + "..................................\n", + ".................................\n", + "................................\n", + "...............................\n", + "..............................\n", + ".............................\n", + "............................\n", + "...........................\n", + "..........................\n", + ".........................\n", + "........................\n", + ".......................\n", + "......................\n", + ".....................\n", + "....................\n", + "...................\n", + "..................\n", + ".................\n", + "................\n", + "...............\n", + "..............\n", + ".............\n", + "............\n", + "...........\n", + "..........\n", + ".........\n", + "........\n", + ".......\n", + "......\n", + ".....\n", + "....\n", + "...\n", + "..\n", + ".\n" + ] + } + ], + "source": [ + "import sys \n", + "print(\"Number of sentences = {:d}\".format(len(sentences))) \n", + "\n", + "dMatrix=[]\n", + "for i in range(0,len(sentences)):\n", + " row=[]\n", + " 
dMatrix.append(row)\n", + " sys.stdout.write('\\n')\n", + " for j in range(0,len(sentences)):\n", + " if(ij):\n", + " d = dMatrix[j][i]\n", + " dMatrix[i].append(d)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 88, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "inlinks = []\n", + "outlinks = []\n", + "j=0\n", + "for i, row in tsv.iterrows():\n", + " sid = row['SentenceId']\n", + " codeStr = row['Codes']\n", + " paragraph = row['Paragraph']\n", + " text = row['Sentence Text']\n", + " heading = row['Headings']\n", + " floatingBox = row['FloatingBox?']\n", + " discourse = row['Discourse Type']\n", + " reachData = row['friesEventsTypes']\n", + " \n", + " if(reachData == reachData):\n", + " allHits += 1\n", + "\n", + " if (heading != heading):\n", + " heading = \"\"\n", + "\n", + " if (floatingBox):\n", + " continue\n", + "\n", + " if (('implication' not in discourse) and\n", + " 'result' not in discourse):\n", + " continue\n", + "\n", + " if ('methods' in heading.lower()):\n", + " continue\n", + " \n", + " if 'exLink' in codeStr:\n", + " outlinks.append(j)\n", + " else: \n", + " inlinks.append(j)\n", + " j += 1\n", + " \n", + "ii = []\n", + "io = []\n", + "oo = []\n", + "for i in range(0,len(sentences)):\n", + " for j in range(0,len(sentences)):\n", + " if( i in inlinks and j in inlinks):\n", + " ii.append(dMatrix[i][j])\n", + " elif( i in outlinks and j in outlinks):\n", + " oo.append(dMatrix[i][j])\n", + " else: \n", + " io.append(dMatrix[i][j])" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "ALL mean = 2.91, stdev = 0.71, (105, 105)\n", + "ii mean = 2.89, stdev = 0.73, (7225,)\n", + "io mean = 2.95, stdev = 0.70, (3400,)\n", + "oo mean = 2.92, stdev = 0.41, (400,)\n" + ] + } + ], + "source": [ + "a = np.array(dMatrix)\n", + "a_ii = np.array(ii)\n", + "a_oo = np.array(oo)\n", + "a_io = 
np.array(io)\n", + "\n", + "print(\"ALL mean = {:.2f}, stdev = {:.2f}, {:s}\".format(np.mean(a),np.std(a),a.shape))\n", + "print(\"ii mean = {:.2f}, stdev = {:.2f}, {:s}\".format(np.mean(a_ii),np.std(a_ii),a_ii.shape))\n", + "print(\"io mean = {:.2f}, stdev = {:.2f}, {:s}\".format(np.mean(a_io),np.std(a_io),a_io.shape))\n", + "print(\"oo mean = {:.2f}, stdev = {:.2f}, {:s}\".format(np.mean(a_oo),np.std(a_oo),a_oo.shape))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 70, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "[ 0. 2.70951278 3.28758438 2.96953717 3.17719459 2.81105121\n", + " 3.35825087 3.46992865 2.29968692 3.29470469 3.05059795 2.63823061\n", + " 2.98713756 2.84195812 3.40902733 2.99265033 2.90544037 2.73017843\n", + " 3.22730077 3.28885195 2.88110962 2.56501906 2.92114793 2.54567875\n", + " 2.7520886 3.26020055 2.69512023 2.69949859 7.02568466 3.0830886\n", + " 2.44424402 2.8323922 3.29172478 2.11977776 2.6251516 3.31446568\n", + " 2.94809315 3.23356727 2.99033033 2.99645548 3.24785576 2.50086568\n", + " 3.34242533 3.31578968 3.32333538 2.51567323 3.1679571 2.77440658\n", + " 3.24562005 2.56683298 2.54401009 2.19415479 2.24739697 3.38171277\n", + " 2.82798443 2.85669571 3.11864001 2.65150638 3.22795487 2.24072686\n", + " 2.75729774 3.07123952 2.79272931 2.66202148 2.08089359 2.70754171\n", + " 2.34900525 2.88431247 2.43004056 3.19577731 2.67694075 2.29094977\n", + " 7.33531289 2.95541549 2.73295739 3.05693131 2.9249465 3.35044991\n", + " 2.68794189 2.74050119 2.66568818 2.89679137 2.74537219 7.22883396\n", + " 2.79979104]\n" + ] + } + ], + "source": [ + "#y = np.arange(35).reshape(5,7)\n", + "print a[a_i,a_i]\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 93, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "array([[ 0. 
, 2.69064874, 2.81680236, ..., 3.16540014,\n", + " 2.83107124, 2.32218492],\n", + " [ 0. , 2.70951278, 2.30072444, ..., 2.44334137,\n", + " 2.72931747, 2.69064874],\n", + " [ 0. , 2.72334771, 3.28758438, ..., 2.72521473,\n", + " 2.81680236, 2.30072444],\n", + " ..., \n", + " [ 0. , 3.0078866 , 3.05008391, ..., 3.30905467,\n", + " 2.79979104, 2.67234395],\n", + " [ 0. , 3.02760509, 2.83107124, ..., 2.6596075 ,\n", + " 2.81466543, 2.79979104],\n", + " [ 0. , 2.32218492, 2.69064874, ..., 3.04252413,\n", + " 2.67234395, 2.79979104]])" + ] + }, + "execution_count": 93, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import numpy as np\n", + "from sklearn.manifold import TSNE\n", + "\n", + "X = np.array(dMatrix)\n", + "model = TSNE(n_components=2, random_state=0)\n", + "np.set_printoptions(suppress=True)\n", + "Xlayout = model.fit_transform(X) \n", + "X" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "from matplotlib import offsetbox\n", + "\n", + "#----------------------------------------------------------------------\n", + "# Scale and visualize the embedding vectors\n", + "def plot_embedding(X, title=None):\n", + " x_min, x_max = np.min(X, 0), np.max(X, 0)\n", + " X = (X - x_min) / (x_max - x_min)\n", + "\n", + " plt.figure()\n", + " ax = plt.subplot(111)\n", + " for i in range(X.shape[0]):\n", + " plt.text(X[i, 0], X[i, 1], str(i),\n", + " fontdict={'weight': 'bold', 'size': 9})\n", + "\n", + " '''\n", + " if hasattr(offsetbox, 'AnnotationBbox'):\n", + " # only print thumbnails with matplotlib > 1.0\n", + " shown_images = np.array([[1., 1.]]) # just 
something big\n", + " for i in range(digits.data.shape[0]):\n", + " dist = np.sum((X[i] - shown_images) ** 2, 1)\n", + " if np.min(dist) < 4e-3:\n", + " # don't show points that are too close\n", + " continue\n", + " shown_images = np.r_[shown_images, [X[i]]]\n", + " imagebox = offsetbox.AnnotationBbox(\n", + " offsetbox.OffsetImage(digits.images[i], cmap=plt.cm.gray_r),\n", + " X[i])\n", + " ax.add_artist(imagebox) \n", + " '''\n", + " plt.xticks([]), plt.yticks([])\n", + " if title is not None:\n", + " plt.title(title)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'sentences' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mprint\u001b[0m \u001b[0msentences\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;31m#plot_embedding(Xlayout,\"t-SNE embedding of the sentences\")\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;31m#plt.show()\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'sentences' is not defined" + ] + } + ], + "source": [ + "print sentences\n", + "\n", + "#plot_embedding(Xlayout,\"t-SNE embedding of the sentences\")\n", + "#plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Reproducing the demo above" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "('Features:', u'addresses, chicago, 
illinois, media, obama, president, press, speaks')\n" + ] + } + ], + "source": [ + "d1 = \"Obama speaks to the media in Illinois\"\n", + "d2 = \"The President addresses the press in Chicago\"\n", + "\n", + "vect = CountVectorizer(stop_words=\"english\").fit([d1, d2])\n", + "print(\"Features:\", \", \".join(vect.get_feature_names()))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The two documents are completely orthogonal in terms of bag-of-words" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(array([0, 0, 1, 1, 1, 0, 0, 1]), array([1, 1, 0, 0, 0, 1, 1, 0]))\n", + "cosine(doc_1, doc_2) = 1.00\n" + ] + } + ], + "source": [ + "from scipy.spatial.distance import cosine\n", + "v_1, v_2 = vect.transform([d1, d2])\n", + "v_1 = v_1.toarray().ravel()\n", + "v_2 = v_2.toarray().ravel()\n", + "print(v_1, v_2)\n", + "print(\"cosine(doc_1, doc_2) = {:.2f}\".format(cosine(v_1, v_2)))" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "0.7664012231\n", + "d(addresses, speaks) = 0.33\n", + "d(addresses, chicago) = 0.06\n" + ] + } + ], + "source": [ + "from sklearn.metrics import euclidean_distances\n", + "\n", + "#W_ = W[[vocab_dict[w] for w in vect.get_feature_names()]]\n", + "#D_ = euclidean_distances(W_)\n", + "print(\"d(addresses, speaks) = {:.2f}\".format(wv.similarity('addresses','speaks')))\n", + "print(\"d(addresses, chicago) = {:.2f}\".format(wv.similarity('addresses','chicago')))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We will be using [``pyemd``](https://github.com/wmayner/pyemd), a Python wrapper for [Pele and Werman's implementation of the earth mover's distance](http://www.ariel.ac.il/sites/ofirpele/fastemd/)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "ename": "NameError", + "evalue": "name 'D_' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m()\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0mv_1\u001b[0m \u001b[0;34m/=\u001b[0m \u001b[0mv_1\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 7\u001b[0m \u001b[0mv_2\u001b[0m \u001b[0;34m/=\u001b[0m \u001b[0mv_2\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0msum\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 8\u001b[0;31m \u001b[0mD_\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mD_\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mastype\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mnp\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdouble\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 9\u001b[0m \u001b[0mD_\u001b[0m \u001b[0;34m/=\u001b[0m \u001b[0mD_\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mmax\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;31m# just for comparison purposes\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 10\u001b[0m \u001b[0;32mprint\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"d(doc_1, doc_2) = {:.2f}\"\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mformat\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0memd\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mv_1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mv_2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mD_\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n", + "\u001b[0;31mNameError\u001b[0m: name 'D_' is not defined" + ] + } + ], + "source": [ + "from pyemd import emd\n", + 
"\n", + "# pyemd needs double precision input\n", + "v_1 = v_1.astype(np.double)\n", + "v_2 = v_2.astype(np.double)\n", + "v_1 /= v_1.sum()\n", + "v_2 /= v_2.sum()\n", + "D_ = D_.astype(np.double)\n", + "D_ /= D_.max() # just for comparison purposes\n", + "print(\"d(doc_1, doc_2) = {:.2f}\".format(emd(v_1, v_2, D_)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Document classification\n", + "\n", + "We will use the [*20 Newsgroups*](http://qwone.com/~jason/20Newsgroups/) classification task. Because WMD is an expensive computation, for this demo we just use a subset. To emphasize the power of the method, we use a larger test size, but train on relatively few samples." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "newsgroups = fetch_20newsgroups()\n", + "docs, y = newsgroups.data, newsgroups.target" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "docs_train, docs_test, y_train, y_test = train_test_split(docs, y,\n", + " train_size=100,\n", + " test_size=300,\n", + " random_state=0)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Since the `W` embedding array is pretty huge, we might as well restrict it to just the words that actually occur in the dataset." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "vect = CountVectorizer(stop_words=\"english\").fit(docs_train + docs_test)\n", + "common = [word for word in vect.get_feature_names() if word in vocab_dict]\n", + "W_common = W[[vocab_dict[w] for w in common]]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can then create a fixed-vocabulary vectorizer using only the words we have embeddings for." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "vect = CountVectorizer(vocabulary=common, dtype=np.double)\n", + "X_train = vect.fit_transform(docs_train)\n", + "X_test = vect.transform(docs_test)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "One way to proceed is to just pre-compute the pairwise distances between all documents, and use them to search for hyperparameters and evaluate the model. However, that would incur some extra computation, and WMD is expensive. Also, it's not the most pleasant user interface. So we define some scikit-learn compatible estimators for computing the WMD.\n", + "\n", + "**`WordMoversKNN`** subclasses from `KNeighborsClassifier` and overrides the `predict` function to compute the WMD between all training and test samples.\n", + "\n", + "In practice, however, we often don't know what is the best `n_neighbors` to use. Simply wrapping `WordMoversKNN` in a `GridSearchCV` would be rather expensive because of all the distances that would need to be recomputed for every value of `n_neighbors`. So we introduce **`WordMoversKNNCV`**, which, when fitted, performs *cross-validation* to find the best value of `n_neighbors` (under any given evaluation metric), while only computing the WMD once per fold, and only across folds (saving `n_folds * fold_size ** 2` evaluations)." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting word_movers_knn.py\n" + ] + } + ], + "source": [ + "\"\"\"%%file word_movers_knn.py\"\"\"\n", + "\n", + "# Authors: Vlad Niculae, Matt Kusner\n", + "# License: Simplified BSD\n", + "\n", + "import numpy as np\n", + "from sklearn.metrics import euclidean_distances\n", + "from sklearn.externals.joblib import Parallel, delayed\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.utils import check_array\n", + "from sklearn.cross_validation import check_cv\n", + "from sklearn.metrics.scorer import check_scoring\n", + "from sklearn.preprocessing import normalize\n", + "\n", + "from pyemd import emd\n", + "\n", + "\n", + "class WordMoversKNN(KNeighborsClassifier):\n", + " \"\"\"K nearest neighbors classifier using the Word Mover's Distance.\n", + "\n", + " Parameters\n", + " ----------\n", + " \n", + " W_embed : array, shape: (vocab_size, embed_size)\n", + " Precomputed word embeddings between vocabulary items.\n", + " Row indices should correspond to the columns in the bag-of-words input.\n", + "\n", + " n_neighbors : int, optional (default = 5)\n", + " Number of neighbors to use by default for :meth:`k_neighbors` queries.\n", + "\n", + " n_jobs : int, optional (default = 1)\n", + " The number of parallel jobs to run for Word Mover's Distance computation.\n", + " If ``-1``, then the number of jobs is set to the number of CPU cores.\n", + " \n", + " verbose : int, optional\n", + " Controls the verbosity; the higher, the more messages. Defaults to 0.\n", + " \n", + " References\n", + " ----------\n", + " \n", + " Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. 
Weinberger\n", + " From Word Embeddings To Document Distances\n", + " The International Conference on Machine Learning (ICML), 2015\n", + " http://mkusner.github.io/publications/WMD.pdf\n", + " \n", + " \"\"\"\n", + " _pairwise = False\n", + "\n", + " def __init__(self, W_embed, n_neighbors=1, n_jobs=1, verbose=False):\n", + " self.W_embed = W_embed\n", + " self.verbose = verbose\n", + " super(WordMoversKNN, self).__init__(n_neighbors=n_neighbors, n_jobs=n_jobs,\n", + " metric='precomputed', algorithm='brute')\n", + "\n", + " def _wmd(self, i, row, X_train):\n", + " \"\"\"Compute the WMD between training sample i and given test row.\n", + " \n", + " Assumes that `row` and train samples are sparse BOW vectors summing to 1.\n", + " \"\"\"\n", + " union_idx = np.union1d(X_train[i].indices, row.indices)\n", + " W_minimal = self.W_embed[union_idx]\n", + " W_dist = euclidean_distances(W_minimal)\n", + " bow_i = X_train[i, union_idx].A.ravel()\n", + " bow_j = row[:, union_idx].A.ravel()\n", + " return emd(bow_i, bow_j, W_dist)\n", + " \n", + " def _wmd_row(self, row, X_train):\n", + " \"\"\"Wrapper to compute the WMD of a row with all training samples.\n", + " \n", + " Assumes that `row` and train samples are sparse BOW vectors summing to 1.\n", + " Useful for parallelization.\n", + " \"\"\"\n", + " n_samples_train = X_train.shape[0]\n", + " return [self._wmd(i, row, X_train) for i in range(n_samples_train)]\n", + "\n", + " def _pairwise_wmd(self, X_test, X_train=None):\n", + " \"\"\"Computes the word mover's distance between all train and test points.\n", + " \n", + " Parallelized over rows of X_test.\n", + " \n", + " Assumes that train and test samples are sparse BOW vectors summing to 1.\n", + " \n", + " Parameters\n", + " ----------\n", + " X_test: scipy.sparse matrix, shape: (n_test_samples, vocab_size)\n", + " Test samples.\n", + " \n", + " X_train: scipy.sparse matrix, shape: (n_train_samples, vocab_size)\n", + " Training samples. 
If `None`, uses the samples the estimator was fit with.\n", + " \n", + " Returns\n", + " -------\n", + " dist : array, shape: (n_test_samples, n_train_samples)\n", + " Distances between all test samples and all train samples.\n", + " \n", + " \"\"\"\n", + " n_samples_test = X_test.shape[0]\n", + " \n", + " if X_train is None:\n", + " X_train = self._fit_X\n", + "\n", + " dist = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(\n", + " delayed(self._wmd_row)(test_sample, X_train)\n", + " for test_sample in X_test)\n", + "\n", + " return np.array(dist)\n", + "\n", + " def fit(self, X, y):\n", + " \"\"\"Fit the model using X as training data and y as target values\n", + "\n", + " Parameters\n", + " ----------\n", + " X : scipy sparse matrix, shape: (n_samples, n_features)\n", + " Training data. \n", + "\n", + " y : {array-like, sparse matrix}\n", + " Target values of shape = [n_samples] or [n_samples, n_outputs]\n", + "\n", + " \"\"\"\n", + " X = check_array(X, accept_sparse='csr', copy=True)\n", + " X = normalize(X, norm='l1', copy=False)\n", + " return super(WordMoversKNN, self).fit(X, y)\n", + "\n", + " def predict(self, X):\n", + " \"\"\"Predict the class labels for the provided data\n", + " Parameters\n", + " ----------\n", + " X : scipy.sparse matrix, shape (n_test_samples, vocab_size)\n", + " Test samples.\n", + "\n", + " Returns\n", + " -------\n", + " y : array of shape [n_samples]\n", + " Class labels for each data sample.\n", + " \"\"\"\n", + " X = check_array(X, accept_sparse='csr', copy=True)\n", + " X = normalize(X, norm='l1', copy=False)\n", + " dist = self._pairwise_wmd(X)\n", + " return super(WordMoversKNN, self).predict(dist)\n", + " \n", + " \n", + "class WordMoversKNNCV(WordMoversKNN):\n", + " \"\"\"Cross-validated KNN classifier using the Word Mover's Distance.\n", + "\n", + " Parameters\n", + " ----------\n", + " W_embed : array, shape: (vocab_size, embed_size)\n", + " Precomputed word embeddings between vocabulary items.\n", + " Row indices 
should correspond to the columns in the bag-of-words input.\n", + "\n", + " n_neighbors_try : sequence, optional\n", + " List of ``n_neighbors`` values to try.\n", + " If None, tries 1-5 neighbors.\n", + "\n", + " scoring : string, callable or None, optional, default: None\n", + " A string (see model evaluation documentation) or\n", + " a scorer callable object / function with signature\n", + " ``scorer(estimator, X, y)``.\n", + "\n", + " cv : int, cross-validation generator or an iterable, optional\n", + " Determines the cross-validation splitting strategy.\n", + " Possible inputs for cv are:\n", + " - None, to use the default 3-fold cross-validation,\n", + " - integer, to specify the number of folds.\n", + " - An object to be used as a cross-validation generator.\n", + " - An iterable yielding train/test splits.\n", + " For integer/None inputs, StratifiedKFold is used.\n", + "\n", + " n_jobs : int, optional (default = 1)\n", + " The number of parallel jobs to run for Word Mover's Distance computation.\n", + " If ``-1``, then the number of jobs is set to the number of CPU cores.\n", + "\n", + " verbose : int, optional\n", + " Controls the verbosity; the higher, the more messages. Defaults to 0.\n", + "\n", + " Attributes\n", + " ----------\n", + " cv_scores_ : array, shape (n_folds, len(n_neighbors_try))\n", + " Test set scores for each fold.\n", + "\n", + " n_neighbors_ : int,\n", + " The best `n_neighbors` value found.\n", + "\n", + " References\n", + " ----------\n", + "\n", + " Matt J. Kusner, Yu Sun, Nicholas I. Kolkin, Kilian Q. 
Weinberger\n", + " From Word Embeddings To Document Distances\n", + " The International Conference on Machine Learning (ICML), 2015\n", + " http://mkusner.github.io/publications/WMD.pdf\n", + " \n", + " \"\"\"\n", + " def __init__(self, W_embed, n_neighbors_try=None, scoring=None, cv=3,\n", + " n_jobs=1, verbose=False):\n", + " self.cv = cv\n", + " self.n_neighbors_try = n_neighbors_try\n", + " self.scoring = scoring\n", + " super(WordMoversKNNCV, self).__init__(W_embed,\n", + " n_neighbors=None,\n", + " n_jobs=n_jobs,\n", + " verbose=verbose)\n", + "\n", + " def fit(self, X, y):\n", + " \"\"\"Fit KNN model by choosing the best `n_neighbors`.\n", + " \n", + " Parameters\n", + " -----------\n", + " X : scipy.sparse matrix, (n_samples, vocab_size)\n", + " Data\n", + " y : ndarray, shape (n_samples,) or (n_samples, n_targets)\n", + " Target\n", + " \"\"\"\n", + " if self.n_neighbors_try is None:\n", + " n_neighbors_try = range(1, 6)\n", + " else:\n", + " n_neighbors_try = self.n_neighbors_try\n", + "\n", + " X = check_array(X, accept_sparse='csr', copy=True)\n", + " X = normalize(X, norm='l1', copy=False)\n", + "\n", + " cv = check_cv(self.cv, X, y)\n", + " knn = KNeighborsClassifier(metric='precomputed', algorithm='brute')\n", + " scorer = check_scoring(knn, scoring=self.scoring)\n", + "\n", + " scores = []\n", + " for train_ix, test_ix in cv:\n", + " dist = self._pairwise_wmd(X[test_ix], X[train_ix])\n", + " knn.fit(X[train_ix], y[train_ix])\n", + " scores.append([\n", + " scorer(knn.set_params(n_neighbors=k), dist, y[test_ix])\n", + " for k in n_neighbors_try\n", + " ])\n", + " scores = np.array(scores)\n", + " self.cv_scores_ = scores\n", + "\n", + " best_k_ix = np.argmax(np.mean(scores, axis=0))\n", + " best_k = n_neighbors_try[best_k_ix]\n", + " self.n_neighbors = self.n_neighbors_ = best_k\n", + "\n", + " return super(WordMoversKNNCV, self).fit(X, y)" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": { + "collapsed": false + }, + 
"outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=3)]: Done 12 tasks | elapsed: 30.8s\n", + "[Parallel(n_jobs=3)]: Done 34 out of 34 | elapsed: 2.0min finished\n", + "[Parallel(n_jobs=3)]: Done 12 tasks | elapsed: 25.7s\n", + "[Parallel(n_jobs=3)]: Done 33 out of 33 | elapsed: 2.9min finished\n", + "[Parallel(n_jobs=3)]: Done 12 tasks | elapsed: 53.3s\n", + "[Parallel(n_jobs=3)]: Done 33 out of 33 | elapsed: 2.0min finished\n" + ] + }, + { + "data": { + "text/plain": [ + "WordMoversKNNCV(W_embed=memmap([[ 0.04283, -0.01124, ..., -0.05679, -0.00763],\n", + " [ 0.02884, -0.05923, ..., -0.04744, 0.06698],\n", + " ...,\n", + " [ 0.08428, -0.15534, ..., -0.01413, 0.04561],\n", + " [-0.02052, 0.08666, ..., 0.03659, 0.10445]]),\n", + " cv=3, n_jobs=3, n_neighbors_try=range(1, 20), scoring=None,\n", + " verbose=5)" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "knn_cv = WordMoversKNNCV(cv=3,\n", + " n_neighbors_try=range(1, 20),\n", + " W_embed=W_common, verbose=5, n_jobs=3)\n", + "knn_cv.fit(X_train, y_train)" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CV score: 0.38\n" + ] + } + ], + "source": [ + "print(\"CV score: {:.2f}\".format(knn_cv.cv_scores_.mean(axis=0).max()))" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "[Parallel(n_jobs=3)]: Done 12 tasks | elapsed: 32.2s\n", + "[Parallel(n_jobs=3)]: Done 66 tasks | elapsed: 4.3min\n", + "[Parallel(n_jobs=3)]: Done 156 tasks | elapsed: 12.5min\n", + "[Parallel(n_jobs=3)]: Done 282 tasks | elapsed: 30.5min\n", + "[Parallel(n_jobs=3)]: Done 300 out of 300 | elapsed: 48.9min finished\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", 
+ "text": [ + "Test score: 0.31\n" + ] + } + ], + "source": [ + "print(\"Test score: {:.2f}\".format(knn_cv.score(X_test, y_test)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Comparison with other models\n", + "\n", + "Now let's see how WMD compares with some common approaches, on bag of words features. The most apples-to-apples comparison would be\n", + "K nearest neighbors with a cosine similarity metric. This approach performs worse than using WMD. (All scores are accuracies.)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from sklearn.svm import LinearSVC\n", + "from sklearn.neighbors import KNeighborsClassifier\n", + "from sklearn.grid_search import GridSearchCV" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CV score: 0.34\n", + "Test score: 0.22\n" + ] + } + ], + "source": [ + "knn_grid = GridSearchCV(KNeighborsClassifier(metric='cosine', algorithm='brute'),\n", + " dict(n_neighbors=list(range(1, 20))),\n", + " cv=3)\n", + "knn_grid.fit(X_train, y_train)\n", + "print(\"CV score: {:.2f}\".format(knn_grid.best_score_))\n", + "print(\"Test score: {:.2f}\".format(knn_grid.score(X_test, y_test)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Another common method for text classification is the linear support vector machine on bag of words.\n", + "This performs a bit better than vanilla cosine KNN, but worse than using WMD in this setting. In our experience,\n", + "this seems to depend on the amount of training data available." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "CV score: 0.35\n", + "Test score: 0.27\n" + ] + } + ], + "source": [ + "svc_grid = GridSearchCV(LinearSVC(),\n", + " dict(C=np.logspace(-6, 6, 13, base=2)),\n", + " cv=3)\n", + "svc_grid.fit(X_train, y_train)\n", + "print(\"CV score: {:.2f}\".format(svc_grid.best_score_))\n", + "print(\"Test score: {:.2f}\".format(svc_grid.score(X_test, y_test)))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### What have we learned?\n", + "\n", + "WMD is much better at capturing semantic similarity between documents than cosine similarity, due to its ability to generalize to unseen words. The SVM does somewhat better than cosine KNN, but still lacks such out-of-vocabulary generalization. Given enough data, WMD can probably improve this margin, especially using something like metric learning on top.\n", + "\n", + "The exact WMD, as we have used it here, is pretty slow. This code is not optimized as much as it could be; there is room for improvement through caching and using Cython.\n", + "However, a major limitation remains the cost of actually computing the EMD. To scale further, exactness can be relaxed by using lower bounds. In our next post, we will compare such optimization strategies, as discussed in [the WMD paper](http://mkusner.github.io/publications/WMD.pdf)." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.13" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}