Skip to content

Commit

Permalink
Updated UpdateTextPosition notebook
Browse files: browse the repository at this point in the history
  • Loading branch information
kolia1985 committed Aug 11, 2020
1 parent 104a542 commit 1621031
Showing 1 changed file with 42 additions and 21 deletions.
63 changes: 42 additions & 21 deletions jupyter/SparkOcrUpdateTextPosition.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
},
{
"cell_type": "code",
"execution_count": 1,
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -36,7 +36,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -51,7 +51,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -65,9 +65,20 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: spark-nlp==2.5.5 in /usr/local/lib/python3.7/site-packages (2.5.5)\n",
"\u001b[33mWARNING: You are using pip version 19.3.1; however, version 20.2.1 is available.\n",
"You should consider upgrading via the 'pip install --upgrade pip' command.\u001b[0m\n",
"Note: you may need to restart the kernel to use updated packages.\n"
]
}
],
"source": [
"# install from PYPI using secret\n",
"%pip install spark-nlp==2.5.5\n",
Expand All @@ -93,14 +104,14 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"SparkConf Configured, Starting to listen on port: 59744\n",
"SparkConf Configured, Starting to listen on port: 53378\n",
"JAR PATH:/usr/local/lib/python3.7/site-packages/sparkmonitor/listener.jar\n"
]
},
Expand All @@ -114,11 +125,11 @@
" <div>\n",
" <p><b>SparkContext</b></p>\n",
"\n",
" <p><a href=\"http://melnyks-mbp:4043\">Spark UI</a></p>\n",
" <p><a href=\"http://kolia-mbp.dlink:4041\">Spark UI</a></p>\n",
"\n",
" <dl>\n",
" <dt>Version</dt>\n",
" <dd><code>v2.4.4</code></dd>\n",
" <dd><code>v2.3.2</code></dd>\n",
" <dt>Master</dt>\n",
" <dd><code>local[*]</code></dd>\n",
" <dt>AppName</dt>\n",
Expand All @@ -130,10 +141,10 @@
" "
],
"text/plain": [
"<pyspark.sql.session.SparkSession at 0x10c27d2d0>"
"<pyspark.sql.session.SparkSession at 0x1195bb510>"
]
},
"execution_count": 2,
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
Expand All @@ -150,7 +161,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -170,7 +181,7 @@
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -193,7 +204,7 @@
" .setOutputCol(\"spell\")\n",
" \n",
" tokenAssem = TokenAssembler() \\\n",
" .setInputCols(\"spell\") \\\n",
" .setInputCols([\"spell\", \"document\"]) \\\n",
" .setOutputCol(\"newDocs\")\n",
"\n",
" updatedText = UpdateTextPosition() \\\n",
Expand Down Expand Up @@ -248,7 +259,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -266,9 +277,19 @@
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 20,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"spellcheck_norvig download started this may take some time.\n",
"Approximate size to download 4.2 MB\n",
"[OK!]\n"
]
}
],
"source": [
"ocr_result = ocr_pipeline().fit(pdf_example_df).transform(pdf_example_df)\n",
"updated_result= update_text_pipeline().fit(ocr_result).transform(ocr_result)\n",
Expand All @@ -288,7 +309,7 @@
},
{
"cell_type": "code",
"execution_count": 9,
"execution_count": 21,
"metadata": {
"pycharm": {
"name": "#%%\n"
Expand All @@ -298,10 +319,10 @@
{
"data": {
"text/plain": [
"72914"
"1671"
]
},
"execution_count": 9,
"execution_count": 21,
"metadata": {},
"output_type": "execute_result"
}
Expand Down Expand Up @@ -344,4 +365,4 @@
},
"nbformat": 4,
"nbformat_minor": 2
}
}

1 comment on commit 1621031

@review-notebook-app
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Review Jupyter notebook diffs for this commit on ReviewNB

You can open a pull request to discuss changes and offer feedback.


Powered by ReviewNB

Please sign in to comment.