data-integrations · pbo-cirus · Apr 9, 2020 · Apr 10, 2020 · Apr 30, 2020
diff --git a/README.md b/README.md
@@ -3,7 +3,7 @@
 
 # Overview
 
-Following plugins are available in this repository.
+The following plugins are available in this repository.
 
   * Image Extractor Transform
   * Document Extractor Transform

diff --git a/checkstyle.xml b/checkstyle.xml
@@ -17,7 +17,7 @@
 
 <!DOCTYPE module PUBLIC
     "-//Puppy Crawl//DTD Check Configuration 1.3//EN"
-    "http://www.puppycrawl.com/dtds/configuration_1_3.dtd">
+    "http://checkstyle.sourceforge.net/dtds/configuration_1_3.dtd">
 
 <!-- This is a checkstyle configuration file. For descriptions of
 what the following rules do, please see the checkstyle configuration
@@ -55,7 +55,7 @@ page at http://checkstyle.sourceforge.net/config.html -->
   </module>
 
   <module name="RegexpSingleline">
-    <!-- Checks that TODOs are named with some basic formatting. Checks for the following pattern  TODO: ( 
+    <!-- Checks that TODOs are named with some basic formatting. Checks for the following pattern  TODO: (
     -->
     <property name="format" value="((//.*)|(\*.*))TODO[^: (]" />
     <property name="message" value='All TODOs should be named.  e.g. "TODO: (ENG-123) - Refactor when v2 is released."' />
@@ -70,9 +70,6 @@ page at http://checkstyle.sourceforge.net/config.html -->
   <!-- All Java AST specific tests live under TreeWalker module. -->
   <module name="TreeWalker">
 
-    <!-- required for SupressionCommentFilter and SuppressWithNearbyCommentFilter -->
-    <module name="FileContentsHolder"/>
-
     <!--
 
     IMPORT CHECKS
@@ -114,8 +111,8 @@ page at http://checkstyle.sourceforge.net/config.html -->
       <property name="max" value="300"/>
       <property name="countEmpty" value="false"/>
       <property name="severity" value="warning"/>
-   </module>
-   
+    </module>
+
     <!--
 
     JAVADOC CHECKS
@@ -385,22 +382,4 @@ page at http://checkstyle.sourceforge.net/config.html -->
     </module>
 
   </module>
-
-  <!--
-    Optional suppression filter. It is optional because when running with Maven, it should be the
-     checkstyle plugin who provides it. It is only used when this file is used in IntelliJ.
-    -->
-
-  <module name="SuppressionFilter">
-    <property name="file" value="suppressions.xml"/>
-    <property name="optional" value="true"/>
-  </module>
-
-  <module name="SuppressionCommentFilter">
-    <property name="offCommentFormat" value="CHECKSTYLE OFF: (.+)" />
-    <property name="onCommentFormat" value="CHECKSTYLE ON" />
-    <property name="checkFormat" value="Javadoc.*"/>
-    <property name="messageFormat" value="$1"/>
-  </module>
-
-</module>
+</module>
diff --git a/docs/DocumentExtractor-transform.md b/docs/DocumentExtractor-transform.md
@@ -2,8 +2,8 @@
 
 Description
 -----------
-This transform plugin can detect and transcribe text from small(up to 5 pages) PDF and TIFF files stored in Cloud 
-Storage in an online manner.
+This transform plugin uses the Google Cloud Vision API to detect and transcribe text from small documents (up to 5
+pages) in PDF, TIFF, or GIF format that are stored in Google Cloud Storage.
 
 Credentials
 -----------
@@ -12,8 +12,9 @@ provided and can be set to 'auto-detect'.
 Credentials will be automatically read from the cluster environment.
 
 If the plugin is not run on a Dataproc cluster, the path to a service account key must be provided.
-The service account key can be found on the Dashboard in the Cloud Platform Console.
-Make sure the account key has permission to access Google Cloud Vision.
+The service account key can be found on the Dashboard in the Google Cloud Platform Console.
+
+Make sure the account key has permission to access the Google Cloud Vision API.
 The service account key file needs to be available on every node in your cluster and
 must be readable by all users running the job.
 
@@ -27,22 +28,24 @@ When running on other clusters, the file must be present on every node in the cl
 **Project ID**: Google Cloud Project ID, which uniquely identifies a project. It can be found on the Dashboard in the
 Google Cloud Platform Console.
 
-**Path Field**: Field in the input schema containing the path to the image.
+**Path Field**: Name of the field in the input schema containing the path to the image.
+
+**Content Field**: Name of the field in the input schema containing the file content, represented as a stream of bytes.
 
-**Output Field**: Field to store the extracted image features. If the specified output field name already exists in the
-input record, it will be overwritten.
+Exactly one of 'Path Field' or 'Content Field' must be specified; they are mutually exclusive.
 
-**Content Field**: Field in the input schema containing the file content, represented as a stream of bytes.
+**Output Field**: Name of the field to store the extracted image features into. If the specified output field name 
+already exists in the input record, it will be overwritten.
 
-**Mime Type**: The type of the file. Currently only 'application/pdf', 'image/tiff' and 'image/gif' are supported.
-Wildcards are not supported.
+**Mime Type**: The type of the file(s) that will be processed. Currently only 'application/pdf', 'image/tiff' and 
+'image/gif' are supported. Wildcards are not supported.
 
-**Pages**: The pages in the file to perform image annotation.
+**Pages**: The list of pages in the file(s) to perform image annotation on. Enter the list as comma-separated values.
 
-**Features**: Features to extract from images.
+**Features**: Features to extract from the documents.
 
-**Language Hints**: Hints to detect the language of the text in the images.
+**Language Hints**: Hints to detect the language of the text in the documents.
 
 **Aspect Ratios**: Ratio of the width to the height of the image. If not specified, the best possible crop is returned.
 
-**Include Geo Results**: Whether to include results derived from the geo information in the image.
+**Include Geo Results**: Whether to include results derived from the geo information in the document.
diff --git a/docs/ImageExtractor-transform.md b/docs/ImageExtractor-transform.md
@@ -27,10 +27,11 @@ When running on other clusters, the file must be present on every node in the cl
 **Project ID**: Google Cloud Project ID, which uniquely identifies a project. It can be found on the Dashboard in the
 Google Cloud Platform Console.
 
-**Path Field**: Field in the input schema containing the path to the image.
+**Path Field**: Name of the field in the input schema containing the path to images in a Google Cloud Storage bucket.
+Usually this is 'body' when the source is reading a file that contains one path per line.
 
-**Output Field**: Field to store the extracted image features. If the specified output field name already exists in the
-input record, it will be overwritten.
+**Output Field**: Name of the field to store the extracted image features. If the specified output field name already
+exists in the input record, it will be overwritten.
 
 **Features**: Features to extract from images.
 

diff --git a/docs/TextExtractorOffline-action.md → docs/OfflineTextExtractor-action.md b/docs/TextExtractorOffline-action.md → docs/OfflineTextExtractor-action.md