diff --git a/README.md b/README.md index 4655a4f..372b014 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ # Overview -Following plugins are available in this repository. +The following plugins are available in this repository. * Image Extractor Transform * Document Extractor Transform diff --git a/checkstyle.xml b/checkstyle.xml index dca5d9d..48560de 100644 --- a/checkstyle.xml +++ b/checkstyle.xml @@ -55,7 +55,7 @@ page at http://checkstyle.sourceforge.net/config.html --> - @@ -115,7 +115,7 @@ page at http://checkstyle.sourceforge.net/config.html --> - + + - io.cdap.plugin - hydrator-common - ${hydrator.version} + com.google.protobuf + protobuf-java + ${google.protobuf.version} - org.apache.hadoop - hadoop-common - ${hadoop.version} - provided + com.google.cloud + google-cloud-vision + ${google.cloud.vision.version} - commons-logging - commons-logging - - - log4j - log4j - - - org.slf4j - slf4j-log4j12 - - - org.apache.avro - avro + com.google.code.findbugs + jsr305 - org.apache.zookeeper - zookeeper + javax.annotation + javax.annotation-api - com.google.guava - guava + org.apache.hbase + hbase-shaded-client - jersey-core - com.sun.jersey - - - jersey-json - com.sun.jersey - - - jersey-server - com.sun.jersey - - - servlet-api - javax.servlet - - - org.mortbay.jetty - jetty - - - org.mortbay.jetty - jetty-util - - - jasper-compiler - tomcat - - - jasper-runtime - tomcat - - - jsp-api - javax.servlet.jsp - - - slf4j-api - org.slf4j - - - - - org.apache.hadoop - hadoop-mapreduce-client-core - ${hadoop.version} - provided - - - org.slf4j - slf4j-log4j12 + org.apache.hbase + hbase-shaded-server - com.google.inject.extensions - guice-servlet - - - com.sun.jersey - jersey-core + org.apache.avro + avro - com.sun.jersey - jersey-server + com.google.api.grpc + proto-google-cloud-vision-v1p1beta1 - com.sun.jersey - jersey-json + com.google.api.grpc + proto-google-cloud-vision-v1p3beta1 - com.sun.jersey.contribs - jersey-guice + com.google.api.grpc + proto-google-cloud-vision-v1p2beta1 - javax.servlet - servlet-api + com.google.api.grpc + proto-google-cloud-vision-v1p4beta1 com.google.guava guava - com.google.inject - guice + com.google.protobuf + protobuf-java - - com.google.protobuf - protobuf-java - ${google.protobuf.version} - - - com.google.cloud - google-cloud-vision - ${google.cloud.vision.version} - com.google.cloud - google-cloud-vision - ${google.cloud.vision.version} - - - com.google.guava - guava - ${guava.version} - - - com.google.guava - guava - ${guava.version} - - - commons-codec - commons-codec - ${common.codec.version} - - - com.google.code.gson - gson - ${gson.version} + google-cloud-storage + ${google.cloud.storage.version} + + + - com.fasterxml.jackson.core - jackson-databind - ${jackson.core.version} + io.cdap.plugin + google-cloud + ${google.cloud.version} + test - io.cdap.cdap hydrator-test @@ -309,31 +221,16 @@ ${junit.version} test - - org.mockito - mockito-all - ${mockito.version} - test - + org.apache.httpcomponents httpclient ${httpcomponents.version} test - - com.github.tomakehurst - wiremock - ${wiremock.version} - test - - - - - org.apache.maven.plugins @@ -352,7 +249,7 @@ -Xmx3g -Djava.awt.headless=true -XX:MaxPermSize=256m -XX:+UseConcMarkSweepGC -Djava.net.preferIPv4Stack=true - ${surefire.redirectTestOutputToFile} + ${project.surefire.redirectTestOutputToFile} false plain @@ -405,6 +302,7 @@ LICENSE*.txt + **/*.txt *.rst *.md **/*.cdap @@ -458,7 +356,7 @@ com.puppycrawl.tools checkstyle - 8.18 + 6.19 @@ -482,6 +380,29 @@ + + org.apache.maven.plugins + maven-enforcer-plugin + 3.0.0-M3 + + + enforce-version + + enforce + + + + + + + log4j:log4j:[0.0,1.2.17) + + + + + + + org.apache.felix maven-bundle-plugin @@ -490,7 +411,22 @@ <_exportcontents> - io.cdap.plugin.cloud.vision.* + com.google.api,com.google.api.client.*,com.google.api.core,com.google.api.gax.batching, + com.google.api.gax.core,com.google.api.gax.grpc,com.google.api.gax.longrunning, + com.google.api.gax.paging,com.google.api.gax.retrying,com.google.api.gax.rpc, + com.google.api.gax.tracing,com.google.api.resourcenames, + com.google.api.services.storage.model,com.google.auth.*,com.google.cloud, + com.google.cloud.http,com.google.cloud.spi,com.google.cloud.storage, + com.google.cloud.storage.spi,com.google.cloud.storage.spi.v1,com.google.cloud.vision.v1.*, + com.google.common.base,com.google.common.collect,com.google.common.graph, + com.google.common.hash,com.google.common.io,com.google.common.util.concurrent, + com.google.common.util.concurrent.internal,com.google.errorprone.annotations, + com.google.iam.v1,com.google.longrunning,com.google.longrunning.stub,com.google.protobuf.*, + com.google.rpc,com.google.type,io.cdap.plugin.cloud.vision.*,io.grpc,io.grpc.stub, + io.opencensus.common,io.opencensus.trace,io.opencensus.trace.config, + io.opencensus.trace.export,io.opencensus.trace.propagation, + org.checkerframework.checker.nullness.compatqual,org.checkerframework.checker.nullness.qual, + org.checkerframework.framework.qual,org.threeten.bp,org.threeten.bp.* *;inline=false;scope=compile true diff --git a/src/main/java/io/cdap/plugin/cloud/vision/CloudVisionConfig.java b/src/main/java/io/cdap/plugin/cloud/vision/CloudVisionConfig.java old mode 100644 new mode 100755 diff --git a/src/main/java/io/cdap/plugin/cloud/vision/CloudVisionConstants.java b/src/main/java/io/cdap/plugin/cloud/vision/CloudVisionConstants.java old mode 100644 new mode 100755 diff --git a/src/main/java/io/cdap/plugin/cloud/vision/CredentialsHelper.java b/src/main/java/io/cdap/plugin/cloud/vision/CredentialsHelper.java old mode 100644 new mode 100755 diff --git a/src/main/java/io/cdap/plugin/cloud/vision/action/ActionConstants.java b/src/main/java/io/cdap/plugin/cloud/vision/action/ActionConstants.java old mode 100644 new mode 100755 index 3c9e84d..64a14eb --- a/src/main/java/io/cdap/plugin/cloud/vision/action/ActionConstants.java +++ b/src/main/java/io/cdap/plugin/cloud/vision/action/ActionConstants.java @@ -19,40 +19,46 @@ /** * Cloud Vision Action constants. */ -public class ActionConstants { +public final class ActionConstants { /** * Configuration property name used to specify source path. */ public static final String SOURCE_PATH = "sourcePath"; - /** * Configuration property name used to specify destination path. */ public static final String DESTINATION_PATH = "destinationPath"; - /** * Configuration property name used to specify batch size. */ public static final String BATCH_SIZE = "batchSize"; - /** * Configuration property name used to specify the features to extract from images. */ public static final String FEATURES = "features"; - /** * Configuration property name used to specify optional hints. */ public static final String LANGUAGE_HINTS = "languageHints"; - /** * Configuration property name used to specify aspect ratios. */ public static final String ASPECT_RATIOS = "aspectRatios"; - /** * Configuration property name used to specify includeGeoResults. */ public static final String INCLUDE_GEO_RESULTS = "includeGeoResults"; + /** + * Maximum number of Images that can be send at a time to the Cloud Vision API + * + * @see bath information + */ + public static final int MAX_NUMBER_OF_IMAGES_PER_BATCH = 2000; + + /** + * Prevent instantiation. + */ + private ActionConstants() { + } } diff --git a/src/main/java/io/cdap/plugin/cloud/vision/action/GcsBucketHelper.java b/src/main/java/io/cdap/plugin/cloud/vision/action/GcsBucketHelper.java new file mode 100644 index 0000000..f9b2a77 --- /dev/null +++ b/src/main/java/io/cdap/plugin/cloud/vision/action/GcsBucketHelper.java @@ -0,0 +1,63 @@ +/* + * Copyright © 2019 Cask Data, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package io.cdap.plugin.cloud.vision.action; + +import com.google.auth.Credentials; +import com.google.cloud.storage.Blob; +import com.google.cloud.storage.Bucket; +import com.google.cloud.storage.Storage; +import com.google.cloud.storage.StorageOptions; +import java.util.ArrayList; +import java.util.List; + + +/** + * Helper class used to retrieve a list of Blobs from a GCSPath. + */ + +public class GcsBucketHelper { + /** + * Helper function that returns a List of Blobs that are found in a given path on GCS. + * + * @param sourceFolderPath + * @param credentials + * @return List of Blobs + */ + public static List getAllFilesInPath(String sourceFolderPath, Credentials credentials) { + List results = new ArrayList<>(); + + Storage storage = StorageOptions.newBuilder().setCredentials(credentials) + .build().getService(); + + // Loop through the buckets + for (Bucket currentBucket : storage.list().iterateAll()) { + // Loop though the blobs and check if the path is what we want, based on sourceFolderPath + for (Blob blob : currentBucket.list().iterateAll()) { + if (blob.getName().endsWith("/")) { // It's a "folder" + continue; // Ignore + } + // Rebuild the full path of the blob + String fullBlobPath = "gs://" + blob.getBucket() + "/" + blob.getName(); + // Is the blob part of the path we have been given? + if (fullBlobPath.startsWith(sourceFolderPath)) { + results.add(blob); + } + } + } + return results; + } +} diff --git a/src/main/java/io/cdap/plugin/cloud/vision/action/OfflineImageExtractorAction.java b/src/main/java/io/cdap/plugin/cloud/vision/action/OfflineImageExtractorAction.java old mode 100644 new mode 100755 index a8da8a3..ac5e803 --- a/src/main/java/io/cdap/plugin/cloud/vision/action/OfflineImageExtractorAction.java +++ b/src/main/java/io/cdap/plugin/cloud/vision/action/OfflineImageExtractorAction.java @@ -18,6 +18,7 @@ import com.google.api.gax.core.FixedCredentialsProvider; import com.google.auth.Credentials; +import com.google.cloud.storage.Blob; import com.google.cloud.vision.v1.AnnotateImageRequest; import com.google.cloud.vision.v1.AsyncBatchAnnotateImagesRequest; import com.google.cloud.vision.v1.CropHintsParams; @@ -39,23 +40,31 @@ import io.cdap.cdap.etl.api.action.Action; import io.cdap.cdap.etl.api.action.ActionContext; import io.cdap.plugin.cloud.vision.CredentialsHelper; - -import java.util.Arrays; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.util.ArrayList; import java.util.List; import javax.annotation.Nullable; +import static io.cdap.plugin.cloud.vision.action.ActionConstants.MAX_NUMBER_OF_IMAGES_PER_BATCH; /** * Action that runs offline image extractor. */ @Plugin(type = Action.PLUGIN_TYPE) -@Name(OfflineImageExtractorAction.NAME) +@Name(OfflineImageExtractorAction.PLUGIN_NAME) @Description("Action that runs offline image extractor.") public class OfflineImageExtractorAction extends Action { - public static final String NAME = "OfflineImageExtractor"; - + public static final String PLUGIN_NAME = "OfflineImageExtractor"; private final OfflineImageExtractorActionConfig config; + private static final Logger LOG = LoggerFactory.getLogger(OfflineImageExtractorAction.class); public OfflineImageExtractorAction(OfflineImageExtractorActionConfig config) { + if (config.getSourcePath() != null) { + config.setSourcePath(config.getSourcePath().trim()); // Remove whitespace + } + if (config.getDestinationPath() != null) { + config.setDestinationPath(config.getDestinationPath().trim()); // Remove whitespace + } this.config = config; } @@ -63,6 +72,7 @@ public OfflineImageExtractorAction(OfflineImageExtractorActionConfig config) { public void configurePipeline(PipelineConfigurer pipelineConfigurer) { FailureCollector collector = pipelineConfigurer.getStageConfigurer().getFailureCollector(); config.validate(collector); + collector.getOrThrowException(); } @Override @@ -73,53 +83,88 @@ public void run(ActionContext actionContext) throws Exception { Credentials credentials = CredentialsHelper.getCredentials(config.getServiceFilePath()); - ImageAnnotatorSettings imageAnnotatorSettings = ImageAnnotatorSettings.newBuilder() - .setCredentialsProvider(FixedCredentialsProvider.create(credentials)) + // Destination in GCS where the results will be stored + String destinationPath = config.getDestinationPath(); + // Add a '/' at the end if it's not already there + if (!destinationPath.endsWith("/")) { + destinationPath += "/"; + } + GcsDestination gcsDestination = GcsDestination.newBuilder() + .setUri(destinationPath) .build(); - try (ImageAnnotatorClient imageAnnotatorClient = ImageAnnotatorClient.create(imageAnnotatorSettings)) { - - ImageSource source = ImageSource.newBuilder().setImageUri(config.getSourcePath()).build(); - Image image = Image.newBuilder().setSource(source).build(); + OutputConfig outputConfig = OutputConfig.newBuilder() + .setGcsDestination(gcsDestination) + .setBatchSize(config.getBatchSizeValue()) + .build(); - List features = - Arrays.asList(Feature.newBuilder().setType(config.getImageFeature().getFeatureType()).build()); - AnnotateImageRequest.Builder builder = - AnnotateImageRequest.newBuilder().setImage(image).addAllFeatures(features); + ImageAnnotatorSettings imageAnnotatorSettings = ImageAnnotatorSettings.newBuilder() + .setCredentialsProvider(FixedCredentialsProvider.create(credentials)) + .build(); - ImageContext imageContext = getImageContext(); - if (imageContext != null) { - builder.setImageContext(imageContext); - } + // Get all the blobs in the source path + List blobs = GcsBucketHelper.getAllFilesInPath(config.getSourcePath(), credentials); + if (blobs.isEmpty()) { + LOG.warn("Nothing found to process in path: " + config.getSourcePath()); + return; + } - AnnotateImageRequest requestsElement = builder.build(); - List requests = Arrays.asList(requestsElement); - GcsDestination gcsDestination = GcsDestination.newBuilder().setUri(config.getDestinationPath()).build(); + // Prepare the list of requests + List imageRequests = new ArrayList<>(blobs.size()); - // The max number of responses to output in each JSON file - int batchSize = config.getBatchSizeValue(); - OutputConfig outputConfig = - OutputConfig.newBuilder() - .setGcsDestination(gcsDestination) - .setBatchSize(batchSize) - .build(); + // Feature we are going to ask for + Feature feature = Feature.newBuilder() + .setType(config.getImageFeature().getFeatureType()) + .build(); - AsyncBatchAnnotateImagesRequest asyncRequest = - AsyncBatchAnnotateImagesRequest.newBuilder() - .addAllRequests(requests) + try (ImageAnnotatorClient imageAnnotatorClient = ImageAnnotatorClient.create(imageAnnotatorSettings)) { + // Create batches of images to send for processing + // We need to do this because there is a limit on the vision API that will raise an error if there are more + // than MAX_NUMBER_OF_IMAGES_PER_BATCH in a single batch + for (int batchId = 0; + batchId < (1 + blobs.size() / MAX_NUMBER_OF_IMAGES_PER_BATCH); + batchId++) { + for (int index = batchId * MAX_NUMBER_OF_IMAGES_PER_BATCH; + (index < (batchId + 1) * MAX_NUMBER_OF_IMAGES_PER_BATCH) && (index < blobs.size()); + index++) { + Blob blob = blobs.get(index); + // Rebuild the full path of the blob + String fullBlobPath = "gs://" + blob.getBucket() + "/" + blob.getName(); + + ImageSource imageSource = ImageSource.newBuilder().setImageUri(fullBlobPath).build(); + + Image image = Image.newBuilder().setSource(imageSource).build(); + + AnnotateImageRequest.Builder builder = + AnnotateImageRequest.newBuilder().setImage(image).addFeatures(feature); + + ImageContext imageContext = getImageContext(); + if (imageContext != null) { + builder.setImageContext(imageContext); + } + + AnnotateImageRequest annotateImageRequest = builder.build(); + imageRequests.add(annotateImageRequest); + } + + // Send the requests + AsyncBatchAnnotateImagesRequest asyncRequest = AsyncBatchAnnotateImagesRequest.newBuilder() + .addAllRequests(imageRequests) .setOutputConfig(outputConfig) .build(); - imageAnnotatorClient.asyncBatchAnnotateImagesAsync(asyncRequest) - .getInitialFuture() - .get(); + // Wait for the future to complete + imageAnnotatorClient.asyncBatchAnnotateImagesAsync(asyncRequest) + .getInitialFuture() + .get(); + } } catch (Exception exception) { throw new IllegalStateException(exception); } } @Nullable - private ImageContext getImageContext() { + protected ImageContext getImageContext() { switch (config.getImageFeature()) { case TEXT: return Strings.isNullOrEmpty(config.getLanguageHints()) ? null @@ -138,5 +183,4 @@ private ImageContext getImageContext() { return null; } } - } diff --git a/src/main/java/io/cdap/plugin/cloud/vision/action/OfflineImageExtractorActionConfig.java b/src/main/java/io/cdap/plugin/cloud/vision/action/OfflineImageExtractorActionConfig.java old mode 100644 new mode 100755 index 9fa674c..89cddd1 --- a/src/main/java/io/cdap/plugin/cloud/vision/action/OfflineImageExtractorActionConfig.java +++ b/src/main/java/io/cdap/plugin/cloud/vision/action/OfflineImageExtractorActionConfig.java @@ -24,7 +24,6 @@ import io.cdap.cdap.etl.api.FailureCollector; import io.cdap.plugin.cloud.vision.CloudVisionConstants; import io.cdap.plugin.cloud.vision.transform.ImageFeature; - import java.util.Arrays; import java.util.Collections; import java.util.List; @@ -33,7 +32,7 @@ import javax.annotation.Nullable; /** - * Configuration for OfflineImageExtractorAction + * Configuration for OfflineImageExtractorAction. */ public class OfflineImageExtractorActionConfig extends PluginConfig { @@ -86,8 +85,17 @@ public OfflineImageExtractorActionConfig(@Nullable String serviceFilePath, Strin @Nullable Boolean includeGeoResults, String sourcePath, String destinationPath, @Nullable String batchSize) { this.serviceFilePath = serviceFilePath; + + if (sourcePath != null) { + sourcePath = sourcePath.trim(); + } this.sourcePath = sourcePath; + + if (destinationPath != null) { + destinationPath = destinationPath.trim(); + } this.destinationPath = destinationPath; + this.features = features; this.batchSize = batchSize; this.languageHints = languageHints; @@ -97,8 +105,8 @@ public OfflineImageExtractorActionConfig(@Nullable String serviceFilePath, Strin private OfflineImageExtractorActionConfig(Builder builder) { serviceFilePath = builder.serviceFilePath; - sourcePath = builder.sourcePath; - destinationPath = builder.destinationPath; + sourcePath = builder.sourcePath.trim(); + destinationPath = builder.destinationPath.trim(); features = builder.features; batchSize = builder.batchSize; languageHints = builder.languageHints; @@ -106,10 +114,21 @@ private OfflineImageExtractorActionConfig(Builder builder) { includeGeoResults = builder.includeGeoResults; } + /** + * Getter to retrieve a Builder object. + * + * @return a Builder object. + */ public static Builder builder() { return new Builder(); } + /** + * Helper function to get a Builder object based on an existing configuration. + * + * @param copy Configuration object to use as the source to copy from. + * @return Builer object. + */ public static Builder builder(OfflineImageExtractorActionConfig copy) { return builder() .setServiceFilePath(copy.getServiceFilePath()) @@ -175,6 +194,12 @@ public boolean getIncludeGeoResults() { return includeGeoResults != null ? includeGeoResults : false; } + /** + * Slit a comma separated list of properties and turn them into a List. + * + * @param property Comma separated list of properties. + * @return A list of strings after splitting on commas. + */ private List convertPropertyToList(String property) { if (!Strings.isNullOrEmpty(property)) { return Arrays.asList(property.split(",")); @@ -183,6 +208,11 @@ private List convertPropertyToList(String property) { } } + /** + * Validate that the configuration is correct. If not, use the FailureCollector object passed to report errors. + * + * @param collector FailureCollector object to use to report errors. + */ public void validate(FailureCollector collector) { ImageFeature feature = getImageFeature(); if (feature == null) { @@ -196,7 +226,7 @@ public void validate(FailureCollector collector) { batch = Integer.parseInt(batchSize); } catch (NumberFormatException e) { collector.addFailure(String.format("Incorrect value '%s' for Batch Size.", batchSize), - "Provide correct value.") + "Provide correct value.") .withConfigProperty(ActionConstants.BATCH_SIZE); return; } @@ -219,8 +249,16 @@ public void validate(FailureCollector collector) { } } + public void setSourcePath(String sourcePath) { + this.sourcePath = sourcePath; + } + + public void setDestinationPath(String destinationPath) { + this.destinationPath = destinationPath; + } + /** - * Builder for creating a {@link OfflineImageExtractorActionConfig} + * Builder for creating a {@link OfflineImageExtractorActionConfig}. */ public static final class Builder { @Nullable @@ -241,12 +279,20 @@ private Builder() { } public Builder setSourcePath(String sourcePath) { - this.sourcePath = sourcePath; + if (sourcePath != null) { + this.sourcePath = sourcePath.trim(); + } else { + this.sourcePath = sourcePath; + } return this; } public Builder setDestinationPath(String destinationPath) { - this.destinationPath = destinationPath; + if (destinationPath != null) { + this.destinationPath = destinationPath.trim(); + } else { + this.destinationPath = destinationPath; + } return this; } diff --git a/src/main/java/io/cdap/plugin/cloud/vision/action/OfflineTextExtractorAction.java b/src/main/java/io/cdap/plugin/cloud/vision/action/OfflineTextExtractorAction.java new file mode 100644 index 0000000..6d9ea7a --- /dev/null +++ b/src/main/java/io/cdap/plugin/cloud/vision/action/OfflineTextExtractorAction.java @@ -0,0 +1,168 @@ +/* + * Copyright © 2019 Cask Data, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package io.cdap.plugin.cloud.vision.action; + +import com.google.api.gax.core.FixedCredentialsProvider; +import com.google.auth.Credentials; +import com.google.cloud.storage.Blob; +import com.google.cloud.vision.v1.AsyncAnnotateFileRequest; +import com.google.cloud.vision.v1.AsyncBatchAnnotateFilesRequest; +import com.google.cloud.vision.v1.Feature; +import com.google.cloud.vision.v1.GcsDestination; +import com.google.cloud.vision.v1.GcsSource; +import com.google.cloud.vision.v1.ImageAnnotatorClient; +import com.google.cloud.vision.v1.ImageAnnotatorSettings; +import com.google.cloud.vision.v1.ImageContext; +import com.google.cloud.vision.v1.InputConfig; +import com.google.cloud.vision.v1.OutputConfig; +import io.cdap.cdap.api.annotation.Description; +import io.cdap.cdap.api.annotation.Name; +import io.cdap.cdap.api.annotation.Plugin; +import io.cdap.cdap.etl.api.FailureCollector; +import io.cdap.cdap.etl.api.PipelineConfigurer; +import io.cdap.cdap.etl.api.action.Action; +import io.cdap.cdap.etl.api.action.ActionContext; +import io.cdap.plugin.cloud.vision.CredentialsHelper; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import java.util.ArrayList; +import java.util.List; +import static io.cdap.plugin.cloud.vision.action.ActionConstants.MAX_NUMBER_OF_IMAGES_PER_BATCH; + +/** + * Action that runs offline document text extractor. + */ +@Plugin(type = Action.PLUGIN_TYPE) +@Name(OfflineTextExtractorAction.PLUGIN_NAME) +@Description("Action that runs offline document text extractor") +public class OfflineTextExtractorAction extends Action { + public static final String PLUGIN_NAME = "OfflineTextExtractor"; + private final OfflineTextExtractorActionConfig config; + private static final Logger LOG = LoggerFactory.getLogger(OfflineTextExtractorAction.class); + + public OfflineTextExtractorAction(OfflineTextExtractorActionConfig config) { + if (config.getSourcePath() != null) { + config.setSourcePath(config.getSourcePath().trim()); // Remove whitespace + } + if (config.getDestinationPath() != null) { + config.setDestinationPath(config.getDestinationPath().trim()); + } + this.config = config; + } + + @Override + public void configurePipeline(PipelineConfigurer pipelineConfigurer) { + FailureCollector collector = pipelineConfigurer.getStageConfigurer().getFailureCollector(); + config.validate(collector); + collector.getOrThrowException(); + } + + @Override + public void run(ActionContext actionContext) throws Exception { + FailureCollector collector = actionContext.getFailureCollector(); + config.validate(collector); + collector.getOrThrowException(); + + Credentials credentials = CredentialsHelper.getCredentials(config.getServiceAccountFilePath()); + + // Destination in GCS where the results will be stored + String destinationPath = config.getDestinationPath(); + // Add a / at the end if it's not already there + if (!destinationPath.endsWith("/")) { + destinationPath += "/"; + } + GcsDestination gcsDestination = GcsDestination.newBuilder() + .setUri(destinationPath) + .build(); + + OutputConfig outputConfig = OutputConfig.newBuilder() + .setBatchSize(config.getBatchSize()) + .setGcsDestination(gcsDestination) + .build(); + + ImageAnnotatorSettings imageAnnotatorSettings = ImageAnnotatorSettings.newBuilder() + .setCredentialsProvider(FixedCredentialsProvider.create(credentials)) + .build(); + + // Get all the blobs in the source path + List blobs = GcsBucketHelper.getAllFilesInPath(config.getSourcePath(), credentials); + if (blobs.isEmpty()) { + LOG.warn("Nothing found to process in path: " + config.getSourcePath()); + return; + } + + // Prepare the list of requests + List requests = new ArrayList<>(blobs.size()); + + // Feature we are going to ask for + Feature feature = Feature.newBuilder() + .setType(Feature.Type.DOCUMENT_TEXT_DETECTION) + .build(); + + try (ImageAnnotatorClient client = ImageAnnotatorClient.create(imageAnnotatorSettings)) { + // Create batches of images to send for processing + // We need to do this because there is a limit on the vision API that will raise an error if there are more + // than MAX_NUMBER_OF_IMAGES_PER_BATCH in a single batch + for (int batchId = 0; + batchId < (1 + blobs.size() / MAX_NUMBER_OF_IMAGES_PER_BATCH); + batchId++) { + for (int index = batchId * MAX_NUMBER_OF_IMAGES_PER_BATCH; + (index < (batchId + 1) * MAX_NUMBER_OF_IMAGES_PER_BATCH) && (index < blobs.size()); + index++) { + Blob blob = blobs.get(index); + // Rebuild the full path of the blob + String fullBlobPath = "gs://" + blob.getBucket() + "/" + blob.getName(); + + GcsSource gcsSource = GcsSource.newBuilder() + .setUri(fullBlobPath) + .build(); + + InputConfig inputConfig = InputConfig.newBuilder() + .setMimeType(config.getMimeType()) + .setGcsSource(gcsSource) + .build(); + + ImageContext context = ImageContext.newBuilder() + .addAllLanguageHints(config.getLanguageHintsList()) + .build(); + + AsyncAnnotateFileRequest request = AsyncAnnotateFileRequest.newBuilder() + .addFeatures(feature) + .setImageContext(context) + .setInputConfig(inputConfig) + .setOutputConfig(outputConfig) + .build(); + + // Add this request to the list + requests.add(request); + } + + // Send the requests + AsyncBatchAnnotateFilesRequest asyncRequest = AsyncBatchAnnotateFilesRequest.newBuilder() + .addAllRequests(requests) + .build(); + + // Wait for the future to complete + client.asyncBatchAnnotateFilesAsync(asyncRequest) + .getInitialFuture() + .get(); + } + } catch (Exception exception) { + throw new IllegalStateException(exception); + } + } +} diff --git a/src/main/java/io/cdap/plugin/cloud/vision/action/TextExtractorActionConfig.java b/src/main/java/io/cdap/plugin/cloud/vision/action/OfflineTextExtractorActionConfig.java similarity index 79% rename from src/main/java/io/cdap/plugin/cloud/vision/action/TextExtractorActionConfig.java rename to src/main/java/io/cdap/plugin/cloud/vision/action/OfflineTextExtractorActionConfig.java index cdebcef..16f97c2 100644 --- a/src/main/java/io/cdap/plugin/cloud/vision/action/TextExtractorActionConfig.java +++ b/src/main/java/io/cdap/plugin/cloud/vision/action/OfflineTextExtractorActionConfig.java @@ -23,49 +23,44 @@ import io.cdap.cdap.api.plugin.PluginConfig; import io.cdap.cdap.etl.api.FailureCollector; import io.cdap.plugin.cloud.vision.CloudVisionConstants; - import java.util.Arrays; import java.util.Collections; import java.util.List; import javax.annotation.Nullable; /** - * Config class for {@link TextExtractorAction}. + * Config class for {@link OfflineTextExtractorAction}. */ -public class TextExtractorActionConfig extends PluginConfig { - - @Name(CloudVisionConstants.SERVICE_ACCOUNT_FILE_PATH) - @Description("Path on the local file system of the service account key used " - + "for authorization. Can be set to 'auto-detect' when running on a Dataproc cluster. " - + "When running on other clusters, the file must be present on every node in the cluster.") - @Macro - private String serviceFilePath; +public class OfflineTextExtractorActionConfig extends PluginConfig { @Name(ActionConstants.SOURCE_PATH) @Macro @Description("Path to the location of the directory on GCS where the input files are stored.") - private final String sourcePath; - + private String sourcePath; @Name(ActionConstants.DESTINATION_PATH) @Macro @Description("Path to the location of the directory on GCS where output files should be stored.") - private final String destinationPath; - + private String destinationPath; @Name(CloudVisionConstants.MIME_TYPE) @Description("Document type.") private final String mimeType; - @Name(ActionConstants.BATCH_SIZE) @Description("The max number of responses.") private final Integer batchSize; - @Name(CloudVisionConstants.LANGUAGE_HINTS) @Nullable @Description("Optional hints to provide to Cloud Vision API.") private final String languageHints; + @Name(CloudVisionConstants.SERVICE_ACCOUNT_FILE_PATH) + @Description("Path on the local file system of the service account key used " + + "for authorization. Can be set to 'auto-detect' when running on a Dataproc cluster. " + + "When running on other clusters, the file must be present on every node in the cluster.") + @Macro + private String serviceFilePath; - public TextExtractorActionConfig(String serviceFilePath, String sourcePath, String destinationPath, String mimeType, - Integer batchSize, @Nullable String languageHints) { + public OfflineTextExtractorActionConfig(String serviceFilePath, String sourcePath, + String destinationPath, String mimeType, + Integer batchSize, @Nullable String languageHints) { this.serviceFilePath = serviceFilePath; this.sourcePath = sourcePath; this.destinationPath = destinationPath; @@ -74,7 +69,7 @@ public TextExtractorActionConfig(String serviceFilePath, String sourcePath, Stri this.languageHints = languageHints; } - private TextExtractorActionConfig(Builder builder) { + private OfflineTextExtractorActionConfig(Builder builder) { serviceFilePath = builder.serviceFilePath; sourcePath = builder.sourcePath; destinationPath = builder.destinationPath; @@ -87,7 +82,13 @@ public static Builder builder() { return new Builder(); } - public static Builder builder(TextExtractorActionConfig copy) { + /** + * Helper function to get a Builder object based on an existing configuration. + * + * @param copy Configuration object to use as the source to copy from. + * @return Builer object. + */ + public static Builder builder(OfflineTextExtractorActionConfig copy) { Builder builder = new Builder(); builder.setServiceFilePath(copy.getServiceFilePath()); @@ -106,7 +107,8 @@ public String getServiceFilePath() { @Nullable public String getServiceAccountFilePath() { - if (containsMacro(CloudVisionConstants.SERVICE_ACCOUNT_FILE_PATH) || Strings.isNullOrEmpty(serviceFilePath)) { + if (containsMacro(CloudVisionConstants.SERVICE_ACCOUNT_FILE_PATH) + || Strings.isNullOrEmpty(serviceFilePath)) { return null; } @@ -142,8 +144,22 @@ public List getLanguageHintsList() { return Collections.emptyList(); } + public void setSourcePath(String sourcePath) { + this.sourcePath = sourcePath; + } + + public void setDestinationPath(String destinationPath) { + this.destinationPath = destinationPath; + } + + /** + * Validate that the configuration is correct. If not, use the FailureCollector object passed to report errors. + * + * @param collector FailureCollector object to use to report errors. + */ public void validate(FailureCollector collector) { - if (!containsMacro(CloudVisionConstants.SERVICE_ACCOUNT_FILE_PATH) && Strings.isNullOrEmpty(serviceFilePath)) { + if (!containsMacro(CloudVisionConstants.SERVICE_ACCOUNT_FILE_PATH) + && Strings.isNullOrEmpty(serviceFilePath)) { collector.addFailure("Service account file path must be specified.", null) .withConfigProperty(CloudVisionConstants.SERVICE_ACCOUNT_FILE_PATH); } @@ -160,7 +176,7 @@ public void validate(FailureCollector collector) { } /** - * Builder for creating a {@link TextExtractorActionConfig}. + * Builder for creating a {@link OfflineTextExtractorActionConfig}. */ public static final class Builder { private String serviceFilePath; @@ -205,8 +221,8 @@ public Builder setLanguageHints(@Nullable String languageHints) { return this; } - public TextExtractorActionConfig build() { - return new TextExtractorActionConfig(this); + public OfflineTextExtractorActionConfig build() { + return new OfflineTextExtractorActionConfig(this); } } } diff --git a/src/main/java/io/cdap/plugin/cloud/vision/action/TextExtractorAction.java b/src/main/java/io/cdap/plugin/cloud/vision/action/TextExtractorAction.java deleted file mode 100644 index 79c93a9..0000000 --- a/src/main/java/io/cdap/plugin/cloud/vision/action/TextExtractorAction.java +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright © 2019 Cask Data, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); you may not - * use this file except in compliance with the License. You may obtain a copy of - * the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT - * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the - * License for the specific language governing permissions and limitations under - * the License. - */ - -package io.cdap.plugin.cloud.vision.action; - -import com.google.api.gax.core.FixedCredentialsProvider; -import com.google.auth.Credentials; -import com.google.cloud.vision.v1.AsyncAnnotateFileRequest; -import com.google.cloud.vision.v1.AsyncBatchAnnotateFilesRequest; -import com.google.cloud.vision.v1.Feature; -import com.google.cloud.vision.v1.GcsDestination; -import com.google.cloud.vision.v1.GcsSource; -import com.google.cloud.vision.v1.ImageAnnotatorClient; -import com.google.cloud.vision.v1.ImageAnnotatorSettings; -import com.google.cloud.vision.v1.ImageContext; -import com.google.cloud.vision.v1.InputConfig; -import com.google.cloud.vision.v1.OutputConfig; -import io.cdap.cdap.api.annotation.Description; -import io.cdap.cdap.api.annotation.Name; -import io.cdap.cdap.api.annotation.Plugin; -import io.cdap.cdap.etl.api.FailureCollector; -import io.cdap.cdap.etl.api.PipelineConfigurer; -import io.cdap.cdap.etl.api.action.Action; -import io.cdap.cdap.etl.api.action.ActionContext; -import io.cdap.plugin.cloud.vision.CredentialsHelper; -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; - -import java.util.ArrayList; -import java.util.List; - -/** - * Action that runs offline document text extractor. - */ -@Plugin(type = Action.PLUGIN_TYPE) -@Name(TextExtractorAction.PLUGIN_NAME) -@Description("Action that runs offline document text extractor") -public class TextExtractorAction extends Action { - public static final String PLUGIN_NAME = "TextExtractorOffline"; - private static final Logger LOG = LoggerFactory.getLogger(TextExtractorAction.class); - - private final TextExtractorActionConfig config; - - public TextExtractorAction(TextExtractorActionConfig config) { - this.config = config; - } - - @Override - public void configurePipeline(PipelineConfigurer pipelineConfigurer) { - FailureCollector collector = pipelineConfigurer.getStageConfigurer().getFailureCollector(); - config.validate(collector); - collector.getOrThrowException(); - } - - @Override - public void run(ActionContext actionContext) throws Exception { - FailureCollector collector = actionContext.getFailureCollector(); - config.validate(collector); - collector.getOrThrowException(); - - Credentials credentials = CredentialsHelper.getCredentials(config.getServiceAccountFilePath()); - ImageAnnotatorSettings imageAnnotatorSettings = ImageAnnotatorSettings.newBuilder() - .setCredentialsProvider(FixedCredentialsProvider.create(credentials)) - .build(); - - try (ImageAnnotatorClient client = ImageAnnotatorClient.create(imageAnnotatorSettings)) { - List requests = new ArrayList<>(); - - GcsSource gcsSource = GcsSource.newBuilder() - .setUri(config.getSourcePath()) - .build(); - - GcsDestination gcsDestination = GcsDestination.newBuilder() - .setUri(config.getDestinationPath()) - .build(); - - InputConfig inputConfig = InputConfig.newBuilder() - .setMimeType(config.getMimeType()) - .setGcsSource(gcsSource) - .build(); - - OutputConfig outputConfig = OutputConfig.newBuilder() - .setBatchSize(config.getBatchSize()) - .setGcsDestination(gcsDestination) - .build(); - - Feature feature = Feature.newBuilder() - .setType(Feature.Type.DOCUMENT_TEXT_DETECTION) - .build(); - - ImageContext context = ImageContext.newBuilder() - .addAllLanguageHints(config.getLanguageHintsList()) - .build(); - - AsyncAnnotateFileRequest request = AsyncAnnotateFileRequest.newBuilder() - .addFeatures(feature) - .setImageContext(context) - .setInputConfig(inputConfig) - .setOutputConfig(outputConfig) - .build(); - - requests.add(request); - AsyncBatchAnnotateFilesRequest asyncRequest = AsyncBatchAnnotateFilesRequest.newBuilder() - .addAllRequests(requests) - .build(); - - client.asyncBatchAnnotateFilesAsync(asyncRequest) - .getInitialFuture() - .get(); - } - } -} diff --git a/src/main/java/io/cdap/plugin/cloud/vision/exception/CloudVisionExecutionException.java b/src/main/java/io/cdap/plugin/cloud/vision/exception/CloudVisionExecutionException.java old mode 100644 new mode 100755 diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/CloudVisionClient.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/CloudVisionClient.java old mode 100644 new mode 100755 diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/ExtractorTransformConfig.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/ExtractorTransformConfig.java old mode 100644 new mode 100755 diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/ExtractorTransformConstants.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/ExtractorTransformConstants.java old mode 100644 new mode 100755 diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/ImageFeature.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/ImageFeature.java old mode 100644 new mode 100755 diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/ProductCategory.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/ProductCategory.java old mode 100644 new mode 100755 diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/document/DocumentAnnotatorClient.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/document/DocumentAnnotatorClient.java old mode 100644 new mode 100755 index 607e44a..d2fc144 --- a/src/main/java/io/cdap/plugin/cloud/vision/transform/document/DocumentAnnotatorClient.java +++ b/src/main/java/io/cdap/plugin/cloud/vision/transform/document/DocumentAnnotatorClient.java @@ -44,6 +44,14 @@ public DocumentAnnotatorClient(DocumentExtractorTransformConfig config) { this.config = config; } + /** + * Convenience method that wraps a call to extractDocumentFeature(InputConfig inputConfig) by building an + * {@link InputConfig} object from a Byte array containing the image bytes. + * + * @param content Byte array containing the image bytes to ask the cloud vision API to work on. + * @return {@link AnnotateFileResponse} with the information requested from the cloud vision API. + * @throws Exception if there was an error sent back by the cloud vision API. + */ public AnnotateFileResponse extractDocumentFeature(byte[] content) throws Exception { InputConfig inputConfig = InputConfig.newBuilder() .setContent(ByteString.copyFrom(content)) @@ -52,6 +60,14 @@ public AnnotateFileResponse extractDocumentFeature(byte[] content) throws Except return extractDocumentFeature(inputConfig); } + /** + * Convenience method that wraps a call to extractDocumentFeature(InputConfig inputConfig) by building an + * {@link InputConfig} object from a gcsPath pointing to a blob in GCS. + * + * @param gcsPath Path to the blob to ask the cloud vision API to work on. + * @return {@link AnnotateFileResponse} with the information requested from the cloud vision API. + * @throws Exception if there was an error sent back by the cloud vision API. + */ public AnnotateFileResponse extractDocumentFeature(String gcsPath) throws Exception { InputConfig inputConfig = InputConfig.newBuilder() .setGcsSource(GcsSource.newBuilder().setUri(gcsPath)) @@ -60,6 +76,11 @@ public AnnotateFileResponse extractDocumentFeature(String gcsPath) throws Except return extractDocumentFeature(inputConfig); } + /** + * @param inputConfig Object that contains the information to send to the Cloud vision API. + * @return {@link AnnotateFileResponse} with the information requested from the cloud vision API. + * @throws Exception if there was an error sent back by the cloud vision API. + */ public AnnotateFileResponse extractDocumentFeature(InputConfig inputConfig) throws Exception { try (ImageAnnotatorClient client = createImageAnnotatorClient()) { Feature.Type featureType = config.getImageFeature().getFeatureType(); diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/document/DocumentExtractorTransform.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/document/DocumentExtractorTransform.java old mode 100644 new mode 100755 index b8346c1..ca6771e --- a/src/main/java/io/cdap/plugin/cloud/vision/transform/document/DocumentExtractorTransform.java +++ b/src/main/java/io/cdap/plugin/cloud/vision/transform/document/DocumentExtractorTransform.java @@ -57,34 +57,26 @@ public class DocumentExtractorTransform extends Transform emitter) { try { + // There are two ways the cloud vision API can be called. + // 1. By providing the path to a blob in GCS. + // 2. By providing the actual bytes of the image file. if (!Strings.isNullOrEmpty(config.getPathField())) { transformPath(input, emitter); } else { @@ -118,6 +113,13 @@ public void transform(StructuredRecord input, Emitter emitter) } } + /** + * Method that gets a response back from the cloud vision API by providing the path to an image blob in GCS. + * + * @param input {@link StructuredRecord} passed in by CDAP to work with. It contains the actual path to use. + * @param emitter {@link Emitter} object to use to send the response back to CDAP + * @throws Exception Raised if there was an error coming back from the cloud vision API. + */ private void transformPath(StructuredRecord input, Emitter emitter) throws Exception { String documentPath = input.get(config.getPathField()); AnnotateFileResponse response = documentAnnotatorClient.extractDocumentFeature(documentPath); @@ -125,6 +127,13 @@ private void transformPath(StructuredRecord input, Emitter emi emitter.emit(transformed); } + /** + * Method that gets a response back from the cloud vision API by providing the actual bytes of the image file. + * + * @param input {@link StructuredRecord} passed in by CDAP to work with. It contains the actual bytes to use. + * @param emitter {@link Emitter} object to use to send the response back to CDAP + * @throws Exception Raised if there was an error coming back from the cloud vision API. + */ private void transformBytes(StructuredRecord input, Emitter emitter) throws Exception { Object content = input.get(config.getPathField()); byte[] contentBytes = content instanceof ByteBuffer ? Bytes.getBytes((ByteBuffer) content) : (byte[]) content; @@ -133,14 +142,22 @@ private void transformBytes(StructuredRecord input, Emitter em emitter.emit(transformed); } - public Schema getSchema() { + /** + * Get the output Schema to use by combining the input Schema from CDAP and add the fields needed to store the + * information coming back from the cloud vision API. + * + * @return {@link Schema} + */ + public Schema getOutputSchema() { List fields = new ArrayList<>(); - if (inputSchema.getFields() != null) { + // Add the input fields + if (inputSchema != null && inputSchema.getFields() != null) { fields.addAll(inputSchema.getFields()); } - + // Add the fields of the image feature schema Schema pagesSchema = pagesSchema(config.getImageFeature().getSchema()); fields.add(Schema.Field.of(config.getOutputField(), pagesSchema)); + // Build a schema combining all return Schema.recordOf("record", fields); } @@ -148,7 +165,7 @@ public Schema getSchema() { * File Annotation mapped to record with field "page" for page number and "feature" field for extracted image feature. * * @param imageFeatureSchema extracted image feature schema. - * @return File Annotation page schema. + * @return File Annotation page {@link Schema}. */ private Schema pagesSchema(Schema imageFeatureSchema) { return Schema.arrayOf( diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/document/DocumentExtractorTransformConfig.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/document/DocumentExtractorTransformConfig.java old mode 100644 new mode 100755 index 1cd879b..ed4e753 --- a/src/main/java/io/cdap/plugin/cloud/vision/transform/document/DocumentExtractorTransformConfig.java +++ b/src/main/java/io/cdap/plugin/cloud/vision/transform/document/DocumentExtractorTransformConfig.java @@ -78,6 +78,11 @@ public String getPages() { return pages; } + /** + * Convenience method that splits a comma separated string of values and turn those into a List. + * + * @return {@link List} + */ public List getPagesList() { if (Strings.isNullOrEmpty(pages)) { return Collections.emptyList(); @@ -117,14 +122,17 @@ public void validate(FailureCollector collector) { */ public void validateInputSchema(Schema inputSchema, FailureCollector collector) { Schema.Field contentField = inputSchema.getField(getContentField()); - if (contentField != null) { - collector.addFailure(String.format("Content field '%s' is expected to be 'bytes'", getContentField()), null) - .withInputSchemaField(getContentField()); - } Schema.Field pathField = inputSchema.getField(getPathField()); - if (pathField != null) { + + if (pathField != null && pathField.getSchema().getType() != Schema.Type.STRING) { collector.addFailure(String.format("Path field '%s' is expected to be a string", getPathField()), null) .withInputSchemaField(getPathField()); + return; + } + + if (contentField != null && contentField.getSchema().getType() != Schema.Type.STRING) { + collector.addFailure(String.format("Content field '%s' is expected to be a string", getContentField()), null) + .withInputSchemaField(getContentField()); } } diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/document/DocumentExtractorTransformConstants.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/document/DocumentExtractorTransformConstants.java old mode 100644 new mode 100755 diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/document/transformer/FileAnnotationToRecordTransformer.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/document/transformer/FileAnnotationToRecordTransformer.java old mode 100644 new mode 100755 diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/image/ImageAnnotatorClient.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/image/ImageAnnotatorClient.java old mode 100644 new mode 100755 index f345b7b..87d0052 --- a/src/main/java/io/cdap/plugin/cloud/vision/transform/image/ImageAnnotatorClient.java +++ b/src/main/java/io/cdap/plugin/cloud/vision/transform/image/ImageAnnotatorClient.java @@ -43,12 +43,19 @@ public ImageAnnotatorClient(ImageExtractorTransformConfig config) { this.config = config; } + /** + * @param gcsPath {@link String} that contains the path to a blon in GCS to use with the Cloud vision API. + * @return {@link AnnotateImageResponse} with the information requested from the cloud vision API. + * @throws Exception if there was an error sent back by the cloud vision API. + */ public AnnotateImageResponse extractImageFeature(String gcsPath) throws Exception { try (com.google.cloud.vision.v1.ImageAnnotatorClient client = createImageAnnotatorClient()) { ImageSource imgSource = ImageSource.newBuilder().setGcsImageUri(gcsPath).build(); Image img = Image.newBuilder().setSource(imgSource).build(); + Feature.Type featureType = config.getImageFeature().getFeatureType(); Feature feature = Feature.newBuilder().setType(featureType).build(); + AnnotateImageRequest.Builder request = AnnotateImageRequest.newBuilder().addFeatures(feature).setImage(img); ImageContext imageContext = getImageContext(); if (imageContext != null) { @@ -56,13 +63,13 @@ public AnnotateImageResponse extractImageFeature(String gcsPath) throws Exceptio } BatchAnnotateImagesResponse response = client.batchAnnotateImages(Collections.singletonList(request.build())); + AnnotateImageResponse annotateImageResponse = response.getResponses(SINGLE_RESPONSE_INDEX); if (annotateImageResponse.hasError()) { String errorMessage = String.format("Unable to extract '%s' feature of image '%s' due to: '%s'", featureType, gcsPath, annotateImageResponse.getError().getMessage()); throw new CloudVisionExecutionException(errorMessage); } - return annotateImageResponse; } } diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/image/ImageExtractorTransform.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/image/ImageExtractorTransform.java old mode 100644 new mode 100755 index 0c97575..14d35e6 --- a/src/main/java/io/cdap/plugin/cloud/vision/transform/image/ImageExtractorTransform.java +++ b/src/main/java/io/cdap/plugin/cloud/vision/transform/image/ImageExtractorTransform.java @@ -55,27 +55,21 @@ public class ImageExtractorTransform extends Transform emitter) } } - public Schema getSchema() { + /** + * Get the output Schema to use by combining the input Schema from CDAP and add the fields needed to store the + * information coming back from the cloud vision API. + * + * @return {@link Schema} + */ + protected Schema getOutputSchema(Schema inputSchema) { List fields = new ArrayList<>(); - if (inputSchema.getFields() != null) { + // Add the input fields + if (inputSchema != null && inputSchema.getFields() != null) { fields.addAll(inputSchema.getFields()); } - + // Add the fields of the image feature schema fields.add(Schema.Field.of(config.getOutputField(), config.getImageFeature().getSchema())); + // Build a schema combining all return Schema.recordOf("record", fields); } } diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/image/ImageExtractorTransformConfig.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/image/ImageExtractorTransformConfig.java old mode 100644 new mode 100755 index fddb321..b344b36 --- a/src/main/java/io/cdap/plugin/cloud/vision/transform/image/ImageExtractorTransformConfig.java +++ b/src/main/java/io/cdap/plugin/cloud/vision/transform/image/ImageExtractorTransformConfig.java @@ -23,6 +23,7 @@ import io.cdap.cdap.api.annotation.Description; import io.cdap.cdap.api.annotation.Macro; import io.cdap.cdap.api.annotation.Name; +import io.cdap.cdap.api.data.schema.Schema; import io.cdap.cdap.api.plugin.PluginConfig; import io.cdap.cdap.etl.api.FailureCollector; import io.cdap.plugin.cloud.vision.transform.ExtractorTransformConfig; @@ -133,4 +134,21 @@ public void validate(FailureCollector collector) { } } } + + /** + * Validates input schema and checks for type compatibility. + * + * @param inputSchema input schema. + * @param collector failure collector. + */ + public void validateInputSchema(Schema inputSchema, FailureCollector collector) { + Schema.Field pathField = inputSchema.getField(getPathField()); + if (pathField != null && !(pathField.getSchema().getType() == Schema.Type.STRING)) { + collector.addFailure( + String.format("Path field '%s' is expected to be a string", getPathField()), + null).withInputSchemaField(getPathField()); + } + } + + } diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/ColorInfoSchema.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/ColorInfoSchema.java old mode 100644 new mode 100755 diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/CropHintAnnotationSchema.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/CropHintAnnotationSchema.java old mode 100644 new mode 100755 index 9aa356b..38334bb --- a/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/CropHintAnnotationSchema.java +++ b/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/CropHintAnnotationSchema.java @@ -45,7 +45,7 @@ private CropHintAnnotationSchema() { public static final Schema SCHEMA = Schema.recordOf( "crop-hint-annotation-component-record", - Schema.Field.of(POSITION_FIELD_NAME, Schema.arrayOf(VertexSchema.SCHEMA)), + Schema.Field.of(POSITION_FIELD_NAME, Schema.arrayOf(VertexSchema.getSchema("cropHintAnnotation-position"))), Schema.Field.of(CONFIDENCE_FIELD_NAME, Schema.of(Schema.Type.FLOAT)), Schema.Field.of(IMPORTANCE_FRACTION_FIELD_NAME, Schema.of(Schema.Type.FLOAT))); diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/EntityAnnotationSchema.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/EntityAnnotationSchema.java old mode 100644 new mode 100755 diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/EntityAnnotationWithPositionSchema.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/EntityAnnotationWithPositionSchema.java old mode 100644 new mode 100755 index 053b9a8..ed657a5 --- a/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/EntityAnnotationWithPositionSchema.java +++ b/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/EntityAnnotationWithPositionSchema.java @@ -40,7 +40,9 @@ private EntityAnnotationWithPositionSchema() { Schema.Field.of(SCORE_FIELD_NAME, Schema.of(Schema.Type.FLOAT)), Schema.Field.of(TOPICALITY_FIELD_NAME, Schema.of(Schema.Type.FLOAT)), Schema.Field.of(LOCATIONS_FIELD_NAME, Schema.arrayOf(LocationInfo.SCHEMA)), - Schema.Field.of(POSITION_FIELD_NAME, Schema.nullableOf(Schema.arrayOf(VertexSchema.SCHEMA))), + Schema.Field.of(POSITION_FIELD_NAME, Schema.nullableOf(Schema.arrayOf( + VertexSchema.getSchema("entityAnnotation-position")))), Schema.Field.of(PROPERTIES_FIELD_NAME, Schema.arrayOf(Property.SCHEMA)) ); + } diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/FaceAnnotationSchema.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/FaceAnnotationSchema.java old mode 100644 new mode 100755 index 3965d42..9cb6024 --- a/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/FaceAnnotationSchema.java +++ b/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/FaceAnnotationSchema.java @@ -16,8 +16,11 @@ package io.cdap.plugin.cloud.vision.transform.schema; +import com.google.cloud.vision.v1.FaceAnnotation; import com.google.cloud.vision.v1.Likelihood; import io.cdap.cdap.api.data.schema.Schema; +import java.util.ArrayList; +import java.util.List; /** * {@link com.google.cloud.vision.v1.FaceAnnotation} mapped to a record with following fields. @@ -59,52 +62,54 @@ private FaceAnnotationSchema() { /** * Anger likelihood. Possible values are defined by {@link Likelihood}. */ - public static final String ANGER_FIELD_NAME = "anger"; + public static final String ANGER_FIELD_NAME = "angerLikelihood"; /** * Joy likelihood. Possible values are defined by {@link Likelihood}. */ - public static final String JOY_FIELD_NAME = "joy"; + public static final String JOY_FIELD_NAME = "joyLikelihood"; /** * Surprise likelihood. Possible values are defined by {@link Likelihood}. */ - public static final String SURPRISE_FIELD_NAME = "surprise"; + public static final String SURPRISE_FIELD_NAME = "surpriseLikelihood"; /** * Blurred likelihood. Possible values are defined by {@link Likelihood}. */ - public static final String BLURRED_FIELD_NAME = "blurred"; + public static final String BLURRED_FIELD_NAME = "blurredLikelihood"; /** * Under exposed likelihood. Possible values are defined by {@link Likelihood}. */ - public static final String UNDER_EXPOSED_FIELD_NAME = "underExposed"; + public static final String UNDER_EXPOSED_FIELD_NAME = "underExposedLikelihood"; /** * Sorrow likelihood. Possible values are defined by {@link Likelihood}. */ - public static final String SORROW_FIELD_NAME = "sorrow"; + public static final String SORROW_FIELD_NAME = "sorrowLikelihood"; /** * Headwear likelihood. Possible values are defined by {@link Likelihood}. */ - public static final String HEADWEAR_FIELD_NAME = "headwear"; + public static final String HEADWEAR_FIELD_NAME = "headwearLikelihood"; /** - * The bounding polygon around the face. The bounding box is computed to "frame" the face in accordance with human - * expectations. It is based on the landmarker results. Note that one or more x and/or y coordinates may not be - * generated if only a partial face appears in the image to be annotated. + * The bounding polygon around the face. The bounding box is computed to "frame" the face in + * accordance with human expectations. It is based on the landmarker results. Note that one or + * more x and/or y coordinates may not be generated if only a partial face appears in the image + * to be annotated. */ - public static final String POSITION_FIELD_NAME = "position"; + public static final String BOUNDING_POLY_NAME = "boundingPoly"; /** - * The bounding polygon which is tighter than the {@link FaceAnnotationSchema#POSITION_FIELD_NAME}, and encloses only - * the skin part of the face. Typically, it is used to eliminate the face from any image analysis that detects the - * "amount of skin" visible in an image. It is not based on the landmarker results, only on the initial face - * detection, hence the fd (face detection) prefix. + * The bounding polygon, tighter than the {@link FaceAnnotationSchema#BOUNDING_POLY_NAME}, + * encloses only the skin part of the face. Typically, it is used to eliminate the face from any + * image analysis that detects the "amount of skin" visible in an image. It is not based on the + * landmarker results, only on the initial face detection, hence the fd + * (face detection) prefix. */ - public static final String FD_POSITION_FIELD_NAME = "fdPosition"; + public static final String FD_BOUNDING_POLY_NAME = "fdBoundingPoly"; /** * Detected face landmarks. @@ -125,8 +130,8 @@ private FaceAnnotationSchema() { Schema.Field.of(UNDER_EXPOSED_FIELD_NAME, Schema.of(Schema.Type.STRING)), Schema.Field.of(SORROW_FIELD_NAME, Schema.of(Schema.Type.STRING)), Schema.Field.of(HEADWEAR_FIELD_NAME, Schema.of(Schema.Type.STRING)), - Schema.Field.of(POSITION_FIELD_NAME, Schema.arrayOf(VertexSchema.SCHEMA)), - Schema.Field.of(FD_POSITION_FIELD_NAME, Schema.arrayOf(VertexSchema.SCHEMA)), + Schema.Field.of(BOUNDING_POLY_NAME, Schema.arrayOf(VertexSchema.getSchema("bounding-vertex"))), + Schema.Field.of(FD_BOUNDING_POLY_NAME, Schema.arrayOf(VertexSchema.getSchema("fd-bounding-vertex"))), Schema.Field.of(LANDMARKS_FIELD_NAME, Schema.arrayOf(FaceLandmark.SCHEMA)) ); @@ -135,6 +140,11 @@ private FaceAnnotationSchema() { */ public static class FaceLandmark { + + private FaceLandmark() { + throw new AssertionError("Should not instantiate static utility class."); + } + /** * Face landmark type. */ @@ -156,9 +166,26 @@ public static class FaceLandmark { public static final String Z_FIELD_NAME = "z"; public static final Schema SCHEMA = Schema.recordOf("face-landmark-record", - Schema.Field.of(TYPE_FIELD_NAME, Schema.of(Schema.Type.STRING)), + // The following field is not a simple type as it is defined as an Enum + Schema.Field.of(TYPE_FIELD_NAME, Schema.enumWith(FaceAnnotationSchema.getIterableFromEnum( + FaceAnnotation.Landmark.Type.values()))), Schema.Field.of(X_FIELD_NAME, Schema.of(Schema.Type.FLOAT)), Schema.Field.of(Y_FIELD_NAME, Schema.of(Schema.Type.FLOAT)), Schema.Field.of(Z_FIELD_NAME, Schema.of(Schema.Type.FLOAT))); } + + /** + * Helper function to build a {@link List} from an array. + * + * @param input Array of type T. + * @param Type of the input array. + * @return {@link List} + */ + protected static List getIterableFromEnum(T[] input) { + ArrayList list = new ArrayList<>(input.length); + for (T value : input) { + list.add(value.toString()); + } + return list; + } } diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/FullTextAnnotationSchema.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/FullTextAnnotationSchema.java old mode 100644 new mode 100755 index 3a7aef2..8877e4c --- a/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/FullTextAnnotationSchema.java +++ b/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/FullTextAnnotationSchema.java @@ -43,7 +43,7 @@ private FullTextAnnotationSchema() { public static final String PAGES_FIELD_NAME = "pages"; public static final Schema SCHEMA = Schema.recordOf( - "document-text-annotation-component-record", + "fullTextAnnotation", Schema.Field.of(TEXT_FIELD_NAME, Schema.of(Schema.Type.STRING)), Schema.Field.of(PAGES_FIELD_NAME, Schema.arrayOf(TextPage.SCHEMA))); @@ -85,7 +85,7 @@ public static class TextSymbol { * 2----3 * | | * 1----0 - * and the vertice order will still be (0, 1, 2, 3). + * and the vertex order will still be (0, 1, 2, 3). */ public static final String BOUNDING_BOX_FIELD_NAME = "boundingBox"; @@ -93,9 +93,11 @@ public static class TextSymbol { "document-text-page-symbol-record", Schema.Field.of(TEXT_FIELD_NAME, Schema.of(Schema.Type.STRING)), Schema.Field.of(CONFIDENCE_FIELD_NAME, Schema.of(Schema.Type.FLOAT)), - Schema.Field.of(DETECTED_LANGUAGES_FIELD_NAME, Schema.nullableOf(Schema.arrayOf(DetectedLanguage.SCHEMA))), + Schema.Field.of(DETECTED_LANGUAGES_FIELD_NAME, Schema.nullableOf(Schema.arrayOf( + DetectedLanguage.getSchema("textSymbol-detectedLanguages")))), Schema.Field.of(DETECTED_BREAK_FIELD_NAME, Schema.nullableOf(Schema.of(Schema.Type.STRING))), - Schema.Field.of(BOUNDING_BOX_FIELD_NAME, Schema.arrayOf(VertexSchema.SCHEMA))); + Schema.Field.of(BOUNDING_BOX_FIELD_NAME, Schema.arrayOf( + VertexSchema.getSchema("textSymbol-boundingBox")))); } /** @@ -150,9 +152,11 @@ public static class TextWord { Schema.Field.of(TEXT_FIELD_NAME, Schema.of(Schema.Type.STRING)), Schema.Field.of(CONFIDENCE_FIELD_NAME, Schema.of(Schema.Type.FLOAT)), Schema.Field.of(SYMBOLS_FIELD_NAME, Schema.arrayOf(TextSymbol.SCHEMA)), - Schema.Field.of(DETECTED_LANGUAGES_FIELD_NAME, Schema.nullableOf(Schema.arrayOf(DetectedLanguage.SCHEMA))), + Schema.Field.of(DETECTED_LANGUAGES_FIELD_NAME, Schema.nullableOf(Schema.arrayOf( + DetectedLanguage.getSchema("textWord-detectedLanguage")))), Schema.Field.of(DETECTED_BREAK_FIELD_NAME, Schema.nullableOf(Schema.of(Schema.Type.STRING))), - Schema.Field.of(BOUNDING_BOX_FIELD_NAME, Schema.arrayOf(VertexSchema.SCHEMA))); + Schema.Field.of(BOUNDING_BOX_FIELD_NAME, Schema.arrayOf( + VertexSchema.getSchema("textWord-boundingBox")))); } /** @@ -198,7 +202,7 @@ public static class TextParagraph { * 2----3 * | | * 1----0 - * and the vertice order will still be (0, 1, 2, 3). + * and the vertex order will still be (0, 1, 2, 3). */ public static final String BOUNDING_BOX_FIELD_NAME = "boundingBox"; @@ -207,9 +211,11 @@ public static class TextParagraph { Schema.Field.of(TEXT_FIELD_NAME, Schema.of(Schema.Type.STRING)), Schema.Field.of(CONFIDENCE_FIELD_NAME, Schema.of(Schema.Type.FLOAT)), Schema.Field.of(WORDS_FIELD_NAME, Schema.arrayOf(TextWord.SCHEMA)), - Schema.Field.of(DETECTED_LANGUAGES_FIELD_NAME, Schema.nullableOf(Schema.arrayOf(DetectedLanguage.SCHEMA))), + Schema.Field.of(DETECTED_LANGUAGES_FIELD_NAME, Schema.nullableOf(Schema.arrayOf( + DetectedLanguage.getSchema("textParagraph-detectedLanguages")))), Schema.Field.of(DETECTED_BREAK_FIELD_NAME, Schema.nullableOf(Schema.of(Schema.Type.STRING))), - Schema.Field.of(BOUNDING_BOX_FIELD_NAME, Schema.arrayOf(VertexSchema.SCHEMA))); + Schema.Field.of(BOUNDING_BOX_FIELD_NAME, Schema.arrayOf( + VertexSchema.getSchema("textParagraph-boundingBox")))); } /** @@ -260,7 +266,7 @@ public static class TextBlock { * 2----3 * | | * 1----0 - * and the vertice order will still be (0, 1, 2, 3). + * and the vertex order will still be (0, 1, 2, 3). */ public static final String BOUNDING_BOX_FIELD_NAME = "boundingBox"; @@ -270,9 +276,11 @@ public static class TextBlock { Schema.Field.of(BLOCK_TYPE_FIELD_NAME, Schema.of(Schema.Type.STRING)), Schema.Field.of(CONFIDENCE_FIELD_NAME, Schema.of(Schema.Type.FLOAT)), Schema.Field.of(PARAGRAPHS_FIELD_NAME, Schema.arrayOf(TextParagraph.SCHEMA)), - Schema.Field.of(DETECTED_LANGUAGES_FIELD_NAME, Schema.nullableOf(Schema.arrayOf(DetectedLanguage.SCHEMA))), + Schema.Field.of(DETECTED_LANGUAGES_FIELD_NAME, Schema.nullableOf(Schema.arrayOf( + DetectedLanguage.getSchema("textBlock-detectedLanguages")))), Schema.Field.of(DETECTED_BREAK_FIELD_NAME, Schema.nullableOf(Schema.of(Schema.Type.STRING))), - Schema.Field.of(BOUNDING_BOX_FIELD_NAME, Schema.arrayOf(VertexSchema.SCHEMA))); + Schema.Field.of(BOUNDING_BOX_FIELD_NAME, Schema.arrayOf( + VertexSchema.getSchema("textBlock-boundingBox")))); } /** @@ -305,6 +313,11 @@ public static class TextPage { */ public static final String BLOCKS_FIELD_NAME = "blocks"; + /** + * Property section that contains detected languages. + */ + public static final String PROPERTY_FIELD_NAME = "property"; + /** * A list of detected languages together with confidence. */ @@ -318,11 +331,14 @@ public static class TextPage { public static final Schema SCHEMA = Schema.recordOf( "document-text-page-record", Schema.Field.of(TEXT_FIELD_NAME, Schema.of(Schema.Type.STRING)), - Schema.Field.of(WIDTH_FIELD_NAME, Schema.of(Schema.Type.INT)), - Schema.Field.of(HEIGHT_FIELD_NAME, Schema.of(Schema.Type.INT)), + Schema.Field.of(WIDTH_FIELD_NAME, Schema.of(Schema.Type.INT)), // + Schema.Field.of(HEIGHT_FIELD_NAME, Schema.of(Schema.Type.INT)), // Schema.Field.of(CONFIDENCE_FIELD_NAME, Schema.of(Schema.Type.FLOAT)), - Schema.Field.of(BLOCKS_FIELD_NAME, Schema.arrayOf(TextBlock.SCHEMA)), - Schema.Field.of(DETECTED_LANGUAGES_FIELD_NAME, Schema.nullableOf(Schema.arrayOf(DetectedLanguage.SCHEMA))), + Schema.Field.of(BLOCKS_FIELD_NAME, Schema.arrayOf(TextBlock.SCHEMA)), // + Schema.Field.of(PROPERTY_FIELD_NAME, Schema.nullableOf(Schema.arrayOf( + DetectedLanguage.getSchema("textPage-property")))), + Schema.Field.of(DETECTED_LANGUAGES_FIELD_NAME, Schema.nullableOf(Schema.arrayOf( + DetectedLanguage.getSchema("textPage-detectedLanguages")))), Schema.Field.of(DETECTED_BREAK_FIELD_NAME, Schema.nullableOf(Schema.of(Schema.Type.STRING)))); } @@ -333,19 +349,28 @@ public static class TextPage { public static class DetectedLanguage { /** - * The BCP-47 language code, such as "en-US" or "sr-Latn". For more information, see + * The BCP-47 language code, such as "en-US" or "sr-Latin". For more information, see * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier. */ - public static final String CODE_FIELD_NAME = "code"; + public static final String LANGUAGE_CODE_FIELD_NAME = "languageCode"; /** * Confidence of detected language. Range [0, 1]. */ public static final String CONFIDENCE_FIELD_NAME = "confidence"; - public static final Schema SCHEMA = Schema.recordOf( - "detected-language-record", - Schema.Field.of(CODE_FIELD_NAME, Schema.of(Schema.Type.STRING)), - Schema.Field.of(CONFIDENCE_FIELD_NAME, Schema.of(Schema.Type.FLOAT))); + /** + * Utility method to create a {@link Schema} with a specific name. This is useful to create uniquely named schemas + * that will be combined into a larger {@link Schema}. + * + * @param name {@link String} containing the name to give to the returned {@link Schema}. + * @return a {@link Schema} with the given name. + */ + public static Schema getSchema(String name) { + return Schema.recordOf( + name, + Schema.Field.of(LANGUAGE_CODE_FIELD_NAME, Schema.of(Schema.Type.STRING)), + Schema.Field.of(CONFIDENCE_FIELD_NAME, Schema.of(Schema.Type.FLOAT))); + } } } diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/LocalizedObjectAnnotationSchema.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/LocalizedObjectAnnotationSchema.java old mode 100644 new mode 100755 index 26b7b7c..0dbab7d --- a/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/LocalizedObjectAnnotationSchema.java +++ b/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/LocalizedObjectAnnotationSchema.java @@ -54,12 +54,19 @@ protected LocalizedObjectAnnotationSchema() { */ public static final String POSITION_FIELD_NAME = "position"; + /** + * Utility method to create a {@link Schema} with a specific name. This is useful to create uniquely named schemas + * that will be combined into a larger {@link Schema}. + * + * @param name {@link String} containing the name to give to the returned {@link Schema}. + * @return a {@link Schema} with the given name. + */ public static final Schema SCHEMA = Schema.recordOf( "localized-object-annotation-component-record", Schema.Field.of(MID_FIELD_NAME, Schema.of(Schema.Type.STRING)), Schema.Field.of(LANGUAGE_CODE_FIELD_NAME, Schema.of(Schema.Type.STRING)), Schema.Field.of(NAME_FIELD_NAME, Schema.of(Schema.Type.STRING)), Schema.Field.of(SCORE_FIELD_NAME, Schema.of(Schema.Type.FLOAT)), - Schema.Field.of(POSITION_FIELD_NAME, Schema.arrayOf(VertexSchema.SCHEMA)) + Schema.Field.of(POSITION_FIELD_NAME, Schema.arrayOf(VertexSchema.getSchema("localizedObjectAnnotation-position"))) ); } diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/ProductSearchResultsSchema.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/ProductSearchResultsSchema.java old mode 100644 new mode 100755 index 026024e..dd8446b --- a/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/ProductSearchResultsSchema.java +++ b/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/ProductSearchResultsSchema.java @@ -46,9 +46,9 @@ private ProductSearchResultsSchema() { */ public static final String GROUPED_RESULTS_FIELD_NAME = "productGroupedResults"; - public static final Schema SCHEMA = Schema.recordOf("product-search-result-record", + public static final Schema SCHEMA = Schema.recordOf("productSearch-resultRecord", Schema.Field.of(INDEX_TIME_FIELD_NAME, Schema.nullableOf(Schema.of(Schema.Type.STRING))), - Schema.Field.of(RESULTS_FIELD_NAME, Schema.arrayOf(Result.SCHEMA)), + Schema.Field.of(RESULTS_FIELD_NAME, Schema.arrayOf(Result.getSchema("productSearch-result"))), Schema.Field.of(GROUPED_RESULTS_FIELD_NAME, Schema.arrayOf(GroupedResult.SCHEMA)) ); @@ -72,11 +72,20 @@ public static class Result { */ public static final String PRODUCT_FIELD_NAME = "product"; - public static final Schema SCHEMA = Schema.recordOf("result-record", - Schema.Field.of(IMAGE_FIELD_NAME, Schema.of(Schema.Type.STRING)), - Schema.Field.of(SCORE_FIELD_NAME, Schema.of(Schema.Type.FLOAT)), - Schema.Field.of(PRODUCT_FIELD_NAME, Product.SCHEMA) - ); + /** + * Utility method to create a {@link Schema} with a specific name. This is useful to create uniquely named schemas + * that will be combined into a larger {@link Schema}. + * + * @param name {@link String} containing the name to give to the returned {@link Schema}. + * @return a {@link Schema} with the given name. + */ + public static Schema getSchema(String name) { + return Schema.recordOf(name, + Schema.Field.of(IMAGE_FIELD_NAME, Schema.of(Schema.Type.STRING)), + Schema.Field.of(SCORE_FIELD_NAME, Schema.of(Schema.Type.FLOAT)), + Schema.Field.of(PRODUCT_FIELD_NAME, Product.getSchema(name + "result-product")) + ); + } } /** @@ -111,13 +120,24 @@ public static class Product { */ public static final String PRODUCT_LABELS_FIELD_NAME = "productLabels"; - public static final Schema SCHEMA = Schema.recordOf("product-record", - Schema.Field.of(NAME_FIELD_NAME, Schema.of(Schema.Type.STRING)), - Schema.Field.of(DISPLAY_NAME_FIELD_NAME, Schema.of(Schema.Type.STRING)), - Schema.Field.of(DESCRIPTION_FIELD_NAME, Schema.of(Schema.Type.STRING)), - Schema.Field.of(PRODUCT_CATEGORY_FIELD_NAME, Schema.of(Schema.Type.STRING)), - Schema.Field.of(PRODUCT_LABELS_FIELD_NAME, Schema.arrayOf(KeyValue.SCHEMA)) - ); + /** + * Utility method to create a {@link Schema} with a specific name. This is useful to create uniquely named schemas + * that will be combined into a larger {@link Schema}. + * + * @param name {@link String} containing the name to give to the returned {@link Schema}. + * @return a {@link Schema} with the given name. + */ + public static Schema getSchema(String name) { + return Schema.recordOf(name, + Schema.Field.of(NAME_FIELD_NAME, Schema.of(Schema.Type.STRING)), + Schema.Field.of(DISPLAY_NAME_FIELD_NAME, Schema.of(Schema.Type.STRING)), + Schema.Field.of(DESCRIPTION_FIELD_NAME, Schema.of(Schema.Type.STRING)), + Schema.Field.of(PRODUCT_CATEGORY_FIELD_NAME, Schema.of(Schema.Type.STRING)), + Schema.Field.of(PRODUCT_LABELS_FIELD_NAME, Schema.arrayOf( + KeyValue.getSchema(name + "-keyValue"))) + ); + } + } /** @@ -135,10 +155,19 @@ public static class KeyValue { */ public static final String VALUE_FIELD_NAME = "value"; - public static final Schema SCHEMA = Schema.recordOf("key-value-record", - Schema.Field.of(KEY_FIELD_NAME, Schema.of(Schema.Type.STRING)), - Schema.Field.of(VALUE_FIELD_NAME, Schema.of(Schema.Type.STRING)) - ); + /** + * Utility method to create a {@link Schema} with a specific name. This is useful to create uniquely named schemas + * that will be combined into a larger {@link Schema}. + * + * @param name {@link String} containing the name to give to the returned {@link Schema}. + * @return a {@link Schema} with the given name. + */ + public static Schema getSchema(String name) { + return Schema.recordOf(name, + Schema.Field.of(KEY_FIELD_NAME, Schema.of(Schema.Type.STRING)), + Schema.Field.of(VALUE_FIELD_NAME, Schema.of(Schema.Type.STRING)) + ); + } } /** @@ -156,9 +185,10 @@ public static class GroupedResult { */ public static final String RESULTS_FIELD_NAME = "results"; - public static final Schema SCHEMA = Schema.recordOf("grouped-result-record", - Schema.Field.of(POSITION_FIELD_NAME, Schema.arrayOf(VertexSchema.SCHEMA)), - Schema.Field.of(RESULTS_FIELD_NAME, Schema.arrayOf(Result.SCHEMA)) + public static final Schema SCHEMA = Schema.recordOf("groupedResult-record", + Schema.Field.of(POSITION_FIELD_NAME, Schema.arrayOf( + VertexSchema.getSchema("groupedResult-position"))), + Schema.Field.of(RESULTS_FIELD_NAME, Schema.arrayOf(Result.getSchema("groupedResult-results"))) ); } } diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/SafeSearchAnnotationSchema.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/SafeSearchAnnotationSchema.java old mode 100644 new mode 100755 diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/TextAnnotationSchema.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/TextAnnotationSchema.java old mode 100644 new mode 100755 index ac518dc..f5b15f9 --- a/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/TextAnnotationSchema.java +++ b/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/TextAnnotationSchema.java @@ -23,6 +23,7 @@ */ public class TextAnnotationSchema { + private TextAnnotationSchema() { throw new AssertionError("Should not instantiate static utility class."); } @@ -47,5 +48,6 @@ private TextAnnotationSchema() { "text-annotation-component-record", Schema.Field.of(LOCALE_FIELD_NAME, Schema.nullableOf(Schema.of(Schema.Type.STRING))), Schema.Field.of(DESCRIPTION_FIELD_NAME, Schema.of(Schema.Type.STRING)), - Schema.Field.of(POSITION_FIELD_NAME, Schema.arrayOf(VertexSchema.SCHEMA))); + Schema.Field.of(POSITION_FIELD_NAME, Schema.arrayOf(VertexSchema.getSchema("textAnnotation-position"))) + ); } diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/VertexSchema.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/VertexSchema.java old mode 100644 new mode 100755 index c441735..ef5ec78 --- a/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/VertexSchema.java +++ b/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/VertexSchema.java @@ -19,11 +19,14 @@ import io.cdap.cdap.api.data.schema.Schema; /** - * A vertex represents a 2D point in the image. {@link com.google.cloud.vision.v1.Vertex} mapped to a record with + * A vertex represents a 2D point in the image. {@link com.google.cloud.vision.v1.Vertex} mapped to a record with the * following fields. */ public class VertexSchema { + /** + * Prevent instantiating this class. + */ private VertexSchema() { throw new AssertionError("Should not instantiate static utility class."); } @@ -38,7 +41,16 @@ private VertexSchema() { */ public static final String Y_FIELD_NAME = "y"; - public static final Schema SCHEMA = Schema.recordOf("vertex-record", - Schema.Field.of(X_FIELD_NAME, Schema.of(Schema.Type.INT)), - Schema.Field.of(Y_FIELD_NAME, Schema.of(Schema.Type.INT))); + /** + * Utility method to create a {@link Schema} with a specific name. This is useful to create uniquely named schemas + * that will be combined into a larger {@link Schema}. + * + * @param name {@link String} containing the name to give to the returned {@link Schema}. + * @return a {@link Schema} with the given name. + */ + public static Schema getSchema(String name) { + return Schema.recordOf(name, + Schema.Field.of(X_FIELD_NAME, Schema.of(Schema.Type.INT)), + Schema.Field.of(Y_FIELD_NAME, Schema.of(Schema.Type.INT))); + } } diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/WebDetectionSchema.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/WebDetectionSchema.java old mode 100644 new mode 100755 index d68b015..c3c8be5 --- a/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/WebDetectionSchema.java +++ b/src/main/java/io/cdap/plugin/cloud/vision/transform/schema/WebDetectionSchema.java @@ -61,10 +61,10 @@ private WebDetectionSchema() { public static final Schema SCHEMA = Schema.recordOf( "web-detection-record", Schema.Field.of(ENTITIES_FIELD_NAME, Schema.arrayOf(WebEntity.SCHEMA)), - Schema.Field.of(FULL_MATCHING_IMAGES_FIELD_NAME, Schema.arrayOf(WebImage.SCHEMA)), - Schema.Field.of(PARTIAL_MATCHING_IMAGES_FIELD_NAME, Schema.arrayOf(WebImage.SCHEMA)), + Schema.Field.of(FULL_MATCHING_IMAGES_FIELD_NAME, Schema.arrayOf(WebImage.getSchema("fmiWebImage"))), + Schema.Field.of(PARTIAL_MATCHING_IMAGES_FIELD_NAME, Schema.arrayOf(WebImage.getSchema("pmiWebImage"))), Schema.Field.of(PAGES_WITH_MATCHING_IMAGES_FIELD_NAME, Schema.arrayOf(WebPage.SCHEMA)), - Schema.Field.of(VISUALLY_SIMILAR_IMAGES, Schema.arrayOf(WebImage.SCHEMA)), + Schema.Field.of(VISUALLY_SIMILAR_IMAGES, Schema.arrayOf(WebImage.getSchema("vsiWebImage"))), Schema.Field.of(BEST_GUESS_LABELS_FIELD_NAME, Schema.arrayOf(BestGuessLabel.SCHEMA)) ); @@ -132,8 +132,8 @@ public static class WebPage { Schema.Field.of(URL_FIELD_NAME, Schema.of(Schema.Type.STRING)), Schema.Field.of(PAGE_TITLE_FIELD_NAME, Schema.of(Schema.Type.STRING)), Schema.Field.of(SCORE_FIELD_NAME, Schema.of(Schema.Type.FLOAT)), - Schema.Field.of(FULL_MATCHING_IMAGES_FIELD_NAME, Schema.arrayOf(WebImage.SCHEMA)), - Schema.Field.of(PARTIAL_MATCHING_IMAGES_FIELD_NAME, Schema.arrayOf(WebImage.SCHEMA)) + Schema.Field.of(FULL_MATCHING_IMAGES_FIELD_NAME, Schema.arrayOf(WebImage.getSchema("webPage-fmiWebImage"))), + Schema.Field.of(PARTIAL_MATCHING_IMAGES_FIELD_NAME, Schema.arrayOf(WebImage.getSchema("webPage-pmiWebImage"))) ); } @@ -148,7 +148,7 @@ public static class BestGuessLabel { public static final String LABEL_FIELD_NAME = "label"; /** - * The BCP-47 language code, such as "en-US" or "sr-Latn". For more information, see + * The BCP-47 language code, such as "en-US" or "sr-Latin". For more information, see * http://www.unicode.org/reports/tr35/#Unicode_locale_identifier. */ public static final String LANGUAGE_CODE_FIELD_NAME = "languageCode"; @@ -175,10 +175,18 @@ public static class WebImage { */ public static final String SCORE_FIELD_NAME = "score"; - public static final Schema SCHEMA = Schema.recordOf( - "web-image-record", - Schema.Field.of(URL_FIELD_NAME, Schema.of(Schema.Type.STRING)), - Schema.Field.of(SCORE_FIELD_NAME, Schema.of(Schema.Type.FLOAT)) - ); + /** + * Utility method to create a {@link Schema} with a specific name. This is useful to create uniquely named schemas + * that will be combined into a larger {@link Schema}. + * + * @param name {@link String} containing the name to give to the returned {@link Schema}. + * @return a {@link Schema} with the given name. + */ + public static Schema getSchema(String name) { + return Schema.recordOf(name, + Schema.Field.of(URL_FIELD_NAME, Schema.of(Schema.Type.STRING)), + Schema.Field.of(SCORE_FIELD_NAME, Schema.of(Schema.Type.FLOAT)) + ); + } } } diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/CropHintsAnnotationsToRecordTransformer.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/CropHintsAnnotationsToRecordTransformer.java old mode 100644 new mode 100755 index eb579ea..37d9d14 --- a/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/CropHintsAnnotationsToRecordTransformer.java +++ b/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/CropHintsAnnotationsToRecordTransformer.java @@ -35,8 +35,16 @@ public CropHintsAnnotationsToRecordTransformer(Schema schema, String outputField super(schema, outputFieldName); } + /** + * Extract the entire mapping of a {@link AnnotateImageResponse} object to a {@link StructuredRecord} + * using the {@link CropHintAnnotationSchema}. This {@link StructuredRecord} can then be turned into a json document. + * + * @param input {@link StructuredRecord} to add to. + * @param annotateImageResponse {@link AnnotateImageResponse} to get the data from. + */ @Override - public StructuredRecord transform(StructuredRecord input, AnnotateImageResponse annotateImageResponse) { + public StructuredRecord transform(StructuredRecord input, + AnnotateImageResponse annotateImageResponse) { return getOutputRecordBuilder(input) .set(outputFieldName, extractCropHintsAnnotations(annotateImageResponse)) .build(); @@ -48,9 +56,16 @@ private List extractCropHintsAnnotations(AnnotateImageResponse .collect(Collectors.toList()); } + /** + * Extract a {@link StructuredRecord} from a {@link CropHint} passed in. + * + * @param hint Contains the {@link CropHint} information to use. + * @return {@link StructuredRecord} that contains the data mapped to the {@link Schema}. + */ private StructuredRecord extractCropHintRecord(CropHint hint) { Schema hintSchema = getCropHintsAnnotationSchema(); StructuredRecord.Builder builder = StructuredRecord.builder(hintSchema); + Schema.Field positionField = hintSchema.getField(CropHintAnnotationSchema.POSITION_FIELD_NAME); if (positionField != null) { Schema positionSchema = getComponentSchema(positionField); @@ -74,7 +89,7 @@ private StructuredRecord extractCropHintRecord(CropHint hint) { * Retrieves Crop Hints Annotation's non-nullable component schema. Crop Hints Annotation's schema is retrieved * instead of using constant schema since users are free to choose to not include some of the fields. * - * @return Crop Hints Annotation's non-nullable component schema. + * @return Crop Hints Annotation's non-nullable component {@link Schema}. */ private Schema getCropHintsAnnotationSchema() { Schema.Field cropHintsAnnotationsField = schema.getField(outputFieldName); diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/FaceAnnotationsToRecordTransformer.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/FaceAnnotationsToRecordTransformer.java old mode 100644 new mode 100755 index 4373fbe..e1d4ad6 --- a/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/FaceAnnotationsToRecordTransformer.java +++ b/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/FaceAnnotationsToRecordTransformer.java @@ -35,19 +35,39 @@ public FaceAnnotationsToRecordTransformer(Schema schema, String outputFieldName) super(schema, outputFieldName); } + /** + * Extract the entire mapping of a {@link AnnotateImageResponse} object to a {@link StructuredRecord} + * using the {@link FaceAnnotationSchema}. This {@link StructuredRecord} can then be turned into a json document. + * + * @param input {@link StructuredRecord} to add to. + * @param annotateImageResponse {@link AnnotateImageResponse} to get the data from. + */ @Override public StructuredRecord transform(StructuredRecord input, AnnotateImageResponse annotateImageResponse) { - return getOutputRecordBuilder(input) - .set(outputFieldName, extractFaceAnnotations(annotateImageResponse)) - .build(); + StructuredRecord.Builder builder = getOutputRecordBuilder(input); + List extracted = extractFaceAnnotations(annotateImageResponse); + return builder.set(outputFieldName, extracted).build(); } + /** + * Extract a complete {@link List} of {@link StructuredRecord} objects mapped from the {@link AnnotateImageResponse} + * passed in, using the {@link FaceAnnotationSchema}. + * + * @param annotateImageResponse Input {@link AnnotateImageResponse} object to get the data from. + * @return {@link List} containing the {@link StructuredRecord} mapped from the input. + */ private List extractFaceAnnotations(AnnotateImageResponse annotateImageResponse) { return annotateImageResponse.getFaceAnnotationsList().stream() .map(this::extractFaceAnnotationRecord) .collect(Collectors.toList()); } + /** + * Extract a {@link StructuredRecord} from a {@link FaceAnnotation} input. + * + * @param annotation The {@link FaceAnnotation} object to get the data from. + * @return {@link StructuredRecord} containing the data mapped to the {@link Schema}. + */ private StructuredRecord extractFaceAnnotationRecord(FaceAnnotation annotation) { Schema faceSchema = getFaceAnnotationSchema(); StructuredRecord.Builder builder = StructuredRecord.builder(faceSchema); @@ -90,21 +110,21 @@ private StructuredRecord extractFaceAnnotationRecord(FaceAnnotation annotation) String surprise = annotation.getSurpriseLikelihood().name(); builder.set(FaceAnnotationSchema.SURPRISE_FIELD_NAME, surprise); } - Schema.Field positionField = faceSchema.getField(FaceAnnotationSchema.POSITION_FIELD_NAME); + Schema.Field positionField = faceSchema.getField(FaceAnnotationSchema.BOUNDING_POLY_NAME); if (positionField != null) { Schema positionSchema = getComponentSchema(positionField); List position = annotation.getBoundingPoly().getVerticesList().stream() .map(v -> extractVertex(v, positionSchema)) .collect(Collectors.toList()); - builder.set(FaceAnnotationSchema.POSITION_FIELD_NAME, position); + builder.set(FaceAnnotationSchema.BOUNDING_POLY_NAME, position); } - Schema.Field fdPositionField = faceSchema.getField(FaceAnnotationSchema.FD_POSITION_FIELD_NAME); + Schema.Field fdPositionField = faceSchema.getField(FaceAnnotationSchema.FD_BOUNDING_POLY_NAME); if (fdPositionField != null) { Schema positionSchema = getComponentSchema(fdPositionField); List position = annotation.getFdBoundingPoly().getVerticesList().stream() .map(v -> extractVertex(v, positionSchema)) .collect(Collectors.toList()); - builder.set(FaceAnnotationSchema.FD_POSITION_FIELD_NAME, position); + builder.set(FaceAnnotationSchema.FD_BOUNDING_POLY_NAME, position); } Schema.Field landmarksField = faceSchema.getField(FaceAnnotationSchema.LANDMARKS_FIELD_NAME); if (landmarksField != null) { @@ -118,6 +138,13 @@ private StructuredRecord extractFaceAnnotationRecord(FaceAnnotation annotation) return builder.build(); } + /** + * Extract a {@link StructuredRecord} from a {@link FaceAnnotation} input. + * + * @param landmark The {@link FaceAnnotation.Landmark} object to get the data from. + * @param schema The {@link Schema} to use for the mapping. + * @return {@link StructuredRecord} containing the data mapped to the {@link Schema}. + */ private StructuredRecord extractLandmark(FaceAnnotation.Landmark landmark, Schema schema) { StructuredRecord.Builder builder = StructuredRecord.builder(schema); if (schema.getField(FaceAnnotationSchema.FaceLandmark.TYPE_FIELD_NAME) != null) { diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/FullTextAnnotationsToRecordTransformer.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/FullTextAnnotationsToRecordTransformer.java old mode 100644 new mode 100755 index efd371b..8378c5e --- a/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/FullTextAnnotationsToRecordTransformer.java +++ b/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/FullTextAnnotationsToRecordTransformer.java @@ -40,6 +40,13 @@ public FullTextAnnotationsToRecordTransformer(Schema schema, String outputFieldN super(schema, outputFieldName); } + /** + * Extract the entire mapping of a {@link AnnotateImageResponse} object to a {@link StructuredRecord} + * using the {@link FullTextAnnotationSchema}. This {@link StructuredRecord} can then be turned into a json document. + * + * @param input {@link StructuredRecord} to add to. + * @param annotateImageResponse {@link AnnotateImageResponse} to get the data from. + */ @Override public StructuredRecord transform(StructuredRecord input, AnnotateImageResponse annotateImageResponse) { TextAnnotation annotation = annotateImageResponse.getFullTextAnnotation(); @@ -48,6 +55,13 @@ public StructuredRecord transform(StructuredRecord input, AnnotateImageResponse .build(); } + /** + * Extract a {@link StructuredRecord} containing the handwriting information from a {@link TextAnnotation} input + * using a {@link io.cdap.plugin.cloud.vision.transform.schema.TextAnnotationSchema}. + * + * @param annotation A {@link TextAnnotation} object containing the data. + * @return A {@link StructuredRecord} containing the data mapped. + */ private StructuredRecord extractHandwritingAnnotation(TextAnnotation annotation) { Schema hwSchema = getHandwritingAnnotationSchema(); StructuredRecord.Builder builder = StructuredRecord.builder(hwSchema); @@ -67,6 +81,14 @@ private StructuredRecord extractHandwritingAnnotation(TextAnnotation annotation) return builder.build(); } + /** + * Extract a {@link StructuredRecord} containing the page information from a {@link Page} input + * using a {@link Schema} for the mapping. + * + * @param page The {@link Page} object containing the data. + * @param schema The {@link Schema} to use for the mapping of the data. + * @return A {@link StructuredRecord} containing the data mapped. + */ private StructuredRecord extractPage(Page page, Schema schema) { StructuredRecord.Builder builder = StructuredRecord.builder(schema); if (schema.getField(FullTextAnnotationSchema.TextPage.TEXT_FIELD_NAME) != null) { @@ -114,6 +136,14 @@ private StructuredRecord extractPage(Page page, Schema schema) { return builder.build(); } + /** + * Extract a {@link StructuredRecord} containing the block information from a {@link Block} input + * using a {@link Schema} for the mapping. + * + * @param block The {@link Block} object containing the data. + * @param schema The {@link Schema} to use for the mapping of the data. + * @return A {@link StructuredRecord} containing the data mapped. + */ private StructuredRecord extractBlock(Block block, Schema schema) { StructuredRecord.Builder builder = StructuredRecord.builder(schema); if (schema.getField(FullTextAnnotationSchema.TextBlock.TEXT_FIELD_NAME) != null) { @@ -164,6 +194,14 @@ private StructuredRecord extractBlock(Block block, Schema schema) { return builder.build(); } + /** + * Extract a {@link StructuredRecord} containing the paragraph information from a {@link Paragraph} input + * using a {@link Schema} for the mapping. + * + * @param paragraph The {@link Paragraph} object containing the data. + * @param schema The {@link Schema} to use for the mapping of the data. + * @return A {@link StructuredRecord} containing the data mapped. + */ private StructuredRecord extractParagraph(Paragraph paragraph, Schema schema) { StructuredRecord.Builder builder = StructuredRecord.builder(schema); if (schema.getField(FullTextAnnotationSchema.TextParagraph.TEXT_FIELD_NAME) != null) { @@ -209,6 +247,14 @@ private StructuredRecord extractParagraph(Paragraph paragraph, Schema schema) { return builder.build(); } + /** + * Extract a {@link StructuredRecord} containing the paragraph information from a {@link Word} input + * using a {@link Schema} for the mapping. + * + * @param word The {@link Word} object containing the data. + * @param schema The {@link Schema} to use for the mapping of the data. + * @return A {@link StructuredRecord} containing the data mapped. + */ private StructuredRecord extractWord(Word word, Schema schema) { StructuredRecord.Builder builder = StructuredRecord.builder(schema); if (schema.getField(FullTextAnnotationSchema.TextWord.TEXT_FIELD_NAME) != null) { @@ -252,6 +298,14 @@ private StructuredRecord extractWord(Word word, Schema schema) { return builder.build(); } + /** + * Extract a {@link StructuredRecord} containing the paragraph information from a {@link Symbol} input + * using a {@link Schema} for the mapping. + * + * @param symbol The {@link Symbol} object containing the data. + * @param schema The {@link Schema} to use for the mapping of the data. + * @return A {@link StructuredRecord} containing the data mapped. + */ private StructuredRecord extractSymbol(Symbol symbol, Schema schema) { StructuredRecord.Builder builder = StructuredRecord.builder(schema); if (schema.getField(FullTextAnnotationSchema.TextSymbol.TEXT_FIELD_NAME) != null) { @@ -284,10 +338,18 @@ private StructuredRecord extractSymbol(Symbol symbol, Schema schema) { return builder.build(); } + /** + * Extract a {@link StructuredRecord} containing the detected language information from a + * {@link TextAnnotation.DetectedLanguage} input using a {@link Schema} for the mapping. + * + * @param language The {@link Symbol} object containing the data. + * @param schema The {@link Schema} to use for the mapping of the data. + * @return A {@link StructuredRecord} containing the data mapped. + */ private StructuredRecord extractDetectedLanguage(TextAnnotation.DetectedLanguage language, Schema schema) { StructuredRecord.Builder builder = StructuredRecord.builder(schema); - if (schema.getField(FullTextAnnotationSchema.DetectedLanguage.CODE_FIELD_NAME) != null) { - builder.set(FullTextAnnotationSchema.DetectedLanguage.CODE_FIELD_NAME, language.getLanguageCode()); + if (schema.getField(FullTextAnnotationSchema.DetectedLanguage.LANGUAGE_CODE_FIELD_NAME) != null) { + builder.set(FullTextAnnotationSchema.DetectedLanguage.LANGUAGE_CODE_FIELD_NAME, language.getLanguageCode()); } if (schema.getField(FullTextAnnotationSchema.DetectedLanguage.CONFIDENCE_FIELD_NAME) != null) { builder.set(FullTextAnnotationSchema.DetectedLanguage.CONFIDENCE_FIELD_NAME, language.getConfidence()); diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/ImageAnnotationToRecordTransformer.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/ImageAnnotationToRecordTransformer.java old mode 100644 new mode 100755 index c1fb7a8..f494049 --- a/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/ImageAnnotationToRecordTransformer.java +++ b/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/ImageAnnotationToRecordTransformer.java @@ -71,18 +71,29 @@ public ImageAnnotationToRecordTransformer(Schema schema, String outputFieldName) */ protected StructuredRecord.Builder getOutputRecordBuilder(StructuredRecord input) { Schema inputRecordSchema = input.getSchema(); + // schema is the output schema StructuredRecord.Builder outputRecordBuilder = StructuredRecord.builder(schema); - for (Schema.Field field : schema.getFields()) { - if (inputRecordSchema.getField(field.getName()) == null) { - continue; + // Loop through the input fields and copy them to the output + for (Schema.Field inputField : inputRecordSchema.getFields()) { + if (schema.getField(inputField.getName()) == null) { + continue; // Not found at the output } + // copy input record field values - outputRecordBuilder.set(field.getName(), input.get(field.getName())); + outputRecordBuilder.set(inputField.getName(), input.get(inputField.getName())); } return outputRecordBuilder; } + /** + * Extract a {@link StructuredRecord} containing the vertex information from a + * {@link Vertex} input using a {@link Schema} for the mapping. + * + * @param vertex The {@link Vertex} object containing the data. + * @param schema The {@link Schema} to use for the mapping of the data. + * @return A {@link StructuredRecord} containing the data mapped. + */ protected StructuredRecord extractVertex(Vertex vertex, Schema schema) { StructuredRecord.Builder builder = StructuredRecord.builder(schema); if (schema.getField(VertexSchema.X_FIELD_NAME) != null) { diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/ImagePropertiesAnnotationsToRecordTransformer.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/ImagePropertiesAnnotationsToRecordTransformer.java old mode 100644 new mode 100755 index 4a4b74f..9cee518 --- a/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/ImagePropertiesAnnotationsToRecordTransformer.java +++ b/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/ImagePropertiesAnnotationsToRecordTransformer.java @@ -35,6 +35,13 @@ public ImagePropertiesAnnotationsToRecordTransformer(Schema schema, String outpu super(schema, outputFieldName); } + /** + * Extract the entire mapping of a {@link AnnotateImageResponse} object to a {@link StructuredRecord} + * using the {@link ColorInfoSchema}. This {@link StructuredRecord} can then be turned into a json document. + * + * @param input {@link StructuredRecord} to add to. + * @param annotateImageResponse {@link AnnotateImageResponse} to get the data from. + */ @Override public StructuredRecord transform(StructuredRecord input, AnnotateImageResponse annotateImageResponse) { return getOutputRecordBuilder(input) @@ -42,12 +49,26 @@ public StructuredRecord transform(StructuredRecord input, AnnotateImageResponse .build(); } + /** + * Extract a {@link StructuredRecord} containing the dominant colors information from a + * {@link AnnotateImageResponse} input using a {@link Schema} for the mapping. + * + * @param annotateImageResponse The {@link AnnotateImageResponse} object containing the data. + * @return A {@link StructuredRecord} containing the data mapped. + */ private List extractDominantColors(AnnotateImageResponse annotateImageResponse) { return annotateImageResponse.getImagePropertiesAnnotation().getDominantColors().getColorsList().stream() .map(this::extractColorInfoRecord) .collect(Collectors.toList()); } + /** + * Extract a {@link StructuredRecord} containing the color information from a + * {@link ColorInfo} input using a {@link Schema} for the mapping. + * + * @param colorInfo The {@link ColorInfo} object containing the data. + * @return A {@link StructuredRecord} containing the data mapped. + */ private StructuredRecord extractColorInfoRecord(ColorInfo colorInfo) { Schema faceSchema = getColorInfoSchema(); StructuredRecord.Builder builder = StructuredRecord.builder(faceSchema); diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/LabelAnnotationsToRecordTransformer.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/LabelAnnotationsToRecordTransformer.java old mode 100644 new mode 100755 index 2d32da3..94ced78 --- a/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/LabelAnnotationsToRecordTransformer.java +++ b/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/LabelAnnotationsToRecordTransformer.java @@ -37,6 +37,14 @@ public LabelAnnotationsToRecordTransformer(Schema schema, String outputFieldName super(schema, outputFieldName); } + /** + * Extract the entire mapping of a {@link AnnotateImageResponse} object to a {@link StructuredRecord} + * using the {@link io.cdap.plugin.cloud.vision.transform.schema.LocalizedObjectAnnotationSchema}. + * This {@link StructuredRecord} can then be turned into a json document. + * + * @param input {@link StructuredRecord} to add to. + * @param annotateImageResponse {@link AnnotateImageResponse} to get the data from. + */ @Override public StructuredRecord transform(StructuredRecord input, AnnotateImageResponse annotateImageResponse) { return getOutputRecordBuilder(input) @@ -44,12 +52,26 @@ public StructuredRecord transform(StructuredRecord input, AnnotateImageResponse .build(); } + /** + * Extract a {@link List} of {@link StructuredRecord} containing the label information from a + * {@link AnnotateImageResponse} input using a {@link Schema} for the mapping. + * + * @param annotateImageResponse The {@link AnnotateImageResponse} object containing the data. + * @return A {@link StructuredRecord} containing the data mapped. + */ private List extractLabelAnnotations(AnnotateImageResponse annotateImageResponse) { return annotateImageResponse.getLabelAnnotationsList().stream() .map(this::extractAnnotation) .collect(Collectors.toList()); } + /** + * Extract a {@link List} of {@link StructuredRecord} containing the entity annotation information from a + * {@link AnnotateImageResponse} input using a {@link Schema} for the mapping. + * + * @param annotation The {@link EntityAnnotation} object containing the data. + * @return A {@link StructuredRecord} containing the data mapped. + */ protected StructuredRecord extractAnnotation(EntityAnnotation annotation) { Schema labelSchema = getEntityAnnotationSchema(); StructuredRecord.Builder builder = StructuredRecord.builder(labelSchema); @@ -88,6 +110,14 @@ protected StructuredRecord extractAnnotation(EntityAnnotation annotation) { return builder.build(); } + /** + * Extract a {@link StructuredRecord} containing the location information from a + * {@link LocationInfo} input using a {@link Schema} for the mapping. + * + * @param locationInfo The {@link LocationInfo} object containing the data. + * @param schema The {@link Schema} to use for the mapping. + * @return A {@link StructuredRecord} containing the data mapped. + */ protected StructuredRecord extractLocation(LocationInfo locationInfo, Schema schema) { StructuredRecord.Builder builder = StructuredRecord.builder(schema); if (schema.getField(EntityAnnotationSchema.LocationInfo.LATITUDE_FIELD_NAME) != null) { @@ -102,6 +132,14 @@ protected StructuredRecord extractLocation(LocationInfo locationInfo, Schema sch return builder.build(); } + /** + * Extract a {@link StructuredRecord} containing the property information from a + * {@link Property} input using a {@link Schema} for the mapping. + * + * @param property The {@link Property} object containing the data. + * @param schema The {@link Schema} to use for the mapping. + * @return A {@link StructuredRecord} containing the data mapped. + */ protected StructuredRecord extractProperty(Property property, Schema schema) { StructuredRecord.Builder builder = StructuredRecord.builder(schema); if (schema.getField(EntityAnnotationSchema.Property.NAME_FIELD_NAME) != null) { diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/LandmarkAnnotationsToRecordTransformer.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/LandmarkAnnotationsToRecordTransformer.java old mode 100644 new mode 100755 index 76f08e4..a3c390a --- a/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/LandmarkAnnotationsToRecordTransformer.java +++ b/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/LandmarkAnnotationsToRecordTransformer.java @@ -36,18 +36,40 @@ public LandmarkAnnotationsToRecordTransformer(Schema schema, String outputFieldN } @Override + /** + * Extract the entire mapping of a {@link AnnotateImageResponse} object to a {@link StructuredRecord} + * using the {@link io.cdap.plugin.cloud.vision.transform.schema.LocalizedObjectAnnotationSchema}. + * This {@link StructuredRecord} can then be turned into a json document. + * + * @param input {@link StructuredRecord} to add to. + * @param annotateImageResponse {@link AnnotateImageResponse} to get the data from. + */ public StructuredRecord transform(StructuredRecord input, AnnotateImageResponse annotateImageResponse) { return getOutputRecordBuilder(input) .set(outputFieldName, extractLandmarkAnnotations(annotateImageResponse)) .build(); } + /** + * Extract a {@link List} of {@link StructuredRecord} containing the landmark information from a + * {@link AnnotateImageResponse} input using a {@link Schema} for the mapping. + * + * @param annotateImageResponse The {@link AnnotateImageResponse} object containing the data. + * @return A {@link StructuredRecord} containing the data mapped. + */ private List extractLandmarkAnnotations(AnnotateImageResponse annotateImageResponse) { return annotateImageResponse.getLandmarkAnnotationsList().stream() .map(this::extractAnnotation) .collect(Collectors.toList()); } + /** + * Extract a {@link StructuredRecord} containing the entity information from a + * {@link EntityAnnotation} input using a {@link Schema} for the mapping. + * + * @param annotation The {@link EntityAnnotation} object containing the data. + * @return A {@link StructuredRecord} containing the data mapped. + */ @Override protected StructuredRecord extractAnnotation(EntityAnnotation annotation) { Schema landmarkSchema = getEntityAnnotationSchema(); diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/LocalizedObjectAnnotationsToRecordTransformer.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/LocalizedObjectAnnotationsToRecordTransformer.java old mode 100644 new mode 100755 index 610a303..faf7d25 --- a/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/LocalizedObjectAnnotationsToRecordTransformer.java +++ b/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/LocalizedObjectAnnotationsToRecordTransformer.java @@ -35,6 +35,14 @@ public LocalizedObjectAnnotationsToRecordTransformer(Schema schema, String outpu super(schema, outputFieldName); } + /** + * Extract the entire mapping of a {@link AnnotateImageResponse} object to a {@link StructuredRecord} + * using the {@link io.cdap.plugin.cloud.vision.transform.schema.LocalizedObjectAnnotationSchema}. + * This {@link StructuredRecord} can then be turned into a json document. + * + * @param input {@link StructuredRecord} to add to. + * @param annotateImageResponse {@link AnnotateImageResponse} to get the data from. + */ @Override public StructuredRecord transform(StructuredRecord input, AnnotateImageResponse annotateImageResponse) { return getOutputRecordBuilder(input) @@ -42,12 +50,26 @@ public StructuredRecord transform(StructuredRecord input, AnnotateImageResponse .build(); } + /** + * Extract a {@link List} of {@link StructuredRecord} containing the localized object information from a + * {@link AnnotateImageResponse} input using a {@link Schema} for the mapping. + * + * @param annotateImageResponse The {@link AnnotateImageResponse} object containing the data. + * @return A {@link StructuredRecord} containing the data mapped. + */ private List extractLocalizedObjectAnnotations(AnnotateImageResponse annotateImageResponse) { return annotateImageResponse.getLocalizedObjectAnnotationsList().stream() .map(this::extractLocalizedObjectAnnotationRecord) .collect(Collectors.toList()); } + /** + * Extract a {@link StructuredRecord} containing the localized object information from a + * {@link LocalizedObjectAnnotation} input using a {@link Schema} for the mapping. + * + * @param annotation The {@link LocalizedObjectAnnotation} object containing the data. + * @return A {@link StructuredRecord} containing the data mapped. + */ private StructuredRecord extractLocalizedObjectAnnotationRecord(LocalizedObjectAnnotation annotation) { Schema objSchema = getLocalizedObjectAnnotationSchema(); StructuredRecord.Builder builder = StructuredRecord.builder(objSchema); diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/LogoAnnotationsToRecordTransformer.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/LogoAnnotationsToRecordTransformer.java old mode 100644 new mode 100755 index 9888050..b9a4c26 --- a/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/LogoAnnotationsToRecordTransformer.java +++ b/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/LogoAnnotationsToRecordTransformer.java @@ -19,7 +19,6 @@ import com.google.cloud.vision.v1.AnnotateImageResponse; import io.cdap.cdap.api.data.format.StructuredRecord; import io.cdap.cdap.api.data.schema.Schema; - import java.util.List; import java.util.stream.Collectors; @@ -34,6 +33,14 @@ public LogoAnnotationsToRecordTransformer(Schema schema, String outputFieldName) super(schema, outputFieldName); } + /** + * Extract the entire mapping of a {@link AnnotateImageResponse} object to a {@link StructuredRecord} + * using the {@link io.cdap.plugin.cloud.vision.transform.schema.LocalizedObjectAnnotationSchema}. + * This {@link StructuredRecord} can then be turned into a json document. + * + * @param input {@link StructuredRecord} to add to. + * @param annotateImageResponse {@link AnnotateImageResponse} to get the data from. + */ @Override public StructuredRecord transform(StructuredRecord input, AnnotateImageResponse annotateImageResponse) { return getOutputRecordBuilder(input) @@ -41,6 +48,13 @@ public StructuredRecord transform(StructuredRecord input, AnnotateImageResponse .build(); } + /** + * Extract a {@link List} of {@link StructuredRecord} containing the logo information from a + * {@link AnnotateImageResponse} input using a {@link Schema} for the mapping. + * + * @param annotateImageResponse The {@link AnnotateImageResponse} object containing the data. + * @return A {@link StructuredRecord} containing the data mapped. + */ private List extractLogoAnnotations(AnnotateImageResponse annotateImageResponse) { return annotateImageResponse.getLogoAnnotationsList().stream() .map(this::extractAnnotation) diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/ProductSearchResultToRecordTransformer.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/ProductSearchResultToRecordTransformer.java old mode 100644 new mode 100755 index 69f335d..f8e69e0 --- a/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/ProductSearchResultToRecordTransformer.java +++ b/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/ProductSearchResultToRecordTransformer.java @@ -37,6 +37,14 @@ public ProductSearchResultToRecordTransformer(Schema schema, String outputFieldN super(schema, outputFieldName); } + /** + * Extract the entire mapping of a {@link AnnotateImageResponse} object to a {@link StructuredRecord} + * using the {@link ProductSearchResultsSchema}. This {@link StructuredRecord} can then be turned into + * a json document. + * + * @param input {@link StructuredRecord} to add to. + * @param annotateImageResponse {@link AnnotateImageResponse} to get the data from. + */ @Override public StructuredRecord transform(StructuredRecord input, AnnotateImageResponse annotateImageResponse) { ProductSearchResults productSearchResults = annotateImageResponse.getProductSearchResults(); @@ -45,6 +53,13 @@ public StructuredRecord transform(StructuredRecord input, AnnotateImageResponse .build(); } + /** + * Extract a {@link StructuredRecord} containing the product search information from a + * {@link ProductSearchResults} input using a {@link Schema} for the mapping. + * + * @param searchResults The {@link ProductSearchResults} object containing the data. + * @return A {@link StructuredRecord} containing the data mapped. + */ private StructuredRecord extractProductSearchResults(ProductSearchResults searchResults) { Schema schema = getProductSearchResultSchema(); StructuredRecord.Builder builder = StructuredRecord.builder(schema); @@ -74,6 +89,13 @@ private StructuredRecord extractProductSearchResults(ProductSearchResults search return builder.build(); } + /** + * Extract a {@link StructuredRecord} containing the search result information from a + * {@link ProductSearchResults.Result} input using a {@link Schema} for the mapping. + * + * @param result The {@link ProductSearchResults.Result} object containing the data. + * @return A {@link StructuredRecord} containing the data mapped. + */ private StructuredRecord extractProductSearchResultRecord(ProductSearchResults.Result result, Schema schema) { StructuredRecord.Builder builder = StructuredRecord.builder(schema); if (schema.getField(ProductSearchResultsSchema.Result.IMAGE_FIELD_NAME) != null) { @@ -93,6 +115,14 @@ private StructuredRecord extractProductSearchResultRecord(ProductSearchResults.R return builder.build(); } + /** + * Extract a {@link StructuredRecord} containing the product information from a + * {@link Product} input using a {@link Schema} for the mapping. + * + * @param product The {@link Product} object containing the data. + * @param schema The {@link Schema} to use. + * @return A {@link StructuredRecord} containing the data mapped. + */ private StructuredRecord extractProductRecord(Product product, Schema schema) { StructuredRecord.Builder builder = StructuredRecord.builder(schema); if (schema.getField(ProductSearchResultsSchema.Product.NAME_FIELD_NAME) != null) { @@ -119,6 +149,14 @@ private StructuredRecord extractProductRecord(Product product, Schema schema) { return builder.build(); } + /** + * Extract a {@link StructuredRecord} containing the label information from a + * {@link Product.KeyValue} input using a {@link Schema} for the mapping. + * + * @param label The {@link Product.KeyValue} object containing the data. + * @param schema The {@link Schema} to use. + * @return A {@link StructuredRecord} containing the data mapped. + */ private StructuredRecord extractProductLabelRecord(Product.KeyValue label, Schema schema) { StructuredRecord.Builder builder = StructuredRecord.builder(schema); if (schema.getField(ProductSearchResultsSchema.KeyValue.KEY_FIELD_NAME) != null) { @@ -130,6 +168,14 @@ private StructuredRecord extractProductLabelRecord(Product.KeyValue label, Schem return builder.build(); } + /** + * Extract a {@link StructuredRecord} containing the grouped result information from a + * {@link Product.KeyValue} input using a {@link Schema} for the mapping. + * + * @param result The {@link ProductSearchResults.GroupedResult} object containing the data. + * @param schema The {@link Schema} to use. + * @return A {@link StructuredRecord} containing the data mapped. + */ private StructuredRecord extractProductSearchResultRecord(ProductSearchResults.GroupedResult result, Schema schema) { StructuredRecord.Builder builder = StructuredRecord.builder(schema); Schema.Field positionField = schema.getField(ProductSearchResultsSchema.GroupedResult.POSITION_FIELD_NAME); diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/SafeSearchAnnotationsToRecordTransformer.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/SafeSearchAnnotationsToRecordTransformer.java old mode 100644 new mode 100755 index be1b1d4..469bb25 --- a/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/SafeSearchAnnotationsToRecordTransformer.java +++ b/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/SafeSearchAnnotationsToRecordTransformer.java @@ -33,6 +33,14 @@ public SafeSearchAnnotationsToRecordTransformer(Schema schema, String outputFiel super(schema, outputFieldName); } + /** + * Extract the entire mapping of a {@link AnnotateImageResponse} object to a {@link StructuredRecord} + * using the {@link SafeSearchAnnotationSchema}. This {@link StructuredRecord} can then be turned into + * a json document. + * + * @param input {@link StructuredRecord} to add to. + * @param annotateImageResponse {@link AnnotateImageResponse} to get the data from. + */ @Override public StructuredRecord transform(StructuredRecord input, AnnotateImageResponse annotateImageResponse) { SafeSearchAnnotation annotation = annotateImageResponse.getSafeSearchAnnotation(); @@ -41,6 +49,13 @@ public StructuredRecord transform(StructuredRecord input, AnnotateImageResponse .build(); } + /** + * Extract a {@link StructuredRecord} containing the safe search information from a + * {@link SafeSearchAnnotation} input using a {@link Schema} for the mapping. + * + * @param annotation The {@link SafeSearchAnnotation} object containing the data. + * @return A {@link StructuredRecord} containing the data mapped. + */ private StructuredRecord extractSafeSearchAnnotation(SafeSearchAnnotation annotation) { Schema safeSearchAnnotationSchema = getSafeSearchAnnotationSchema(); StructuredRecord.Builder builder = StructuredRecord.builder(safeSearchAnnotationSchema); diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/TextAnnotationsToRecordTransformer.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/TextAnnotationsToRecordTransformer.java old mode 100644 new mode 100755 index fbefeaa..219212b --- a/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/TextAnnotationsToRecordTransformer.java +++ b/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/TextAnnotationsToRecordTransformer.java @@ -35,6 +35,13 @@ public TextAnnotationsToRecordTransformer(Schema schema, String outputFieldName) super(schema, outputFieldName); } + /** + * Extract the entire mapping of a {@link AnnotateImageResponse} object to a {@link StructuredRecord} + * using the {@link TextAnnotationSchema}. This {@link StructuredRecord} can then be turned into a json document. + * + * @param input {@link StructuredRecord} to add to. + * @param annotateImageResponse {@link AnnotateImageResponse} to get the data from. + */ @Override public StructuredRecord transform(StructuredRecord input, AnnotateImageResponse annotateImageResponse) { return getOutputRecordBuilder(input) @@ -42,12 +49,26 @@ public StructuredRecord transform(StructuredRecord input, AnnotateImageResponse .build(); } + /** + * Extract a {@link List} of {@link StructuredRecord} containing the text information from a + * {@link AnnotateImageResponse} input using a {@link Schema} for the mapping. + * + * @param annotateImageResponse The {@link AnnotateImageResponse} object containing the data. + * @return A {@link StructuredRecord} containing the data mapped. + */ private List extractTextAnnotations(AnnotateImageResponse annotateImageResponse) { return annotateImageResponse.getTextAnnotationsList().stream() .map(this::extractTextAnnotationRecord) .collect(Collectors.toList()); } + /** + * Extract a {@link StructuredRecord} containing the text annotation information from a + * {@link EntityAnnotation} input using a {@link Schema} for the mapping. + * + * @param annotation The {@link EntityAnnotation} object containing the data. + * @return A {@link StructuredRecord} containing the data mapped. + */ private StructuredRecord extractTextAnnotationRecord(EntityAnnotation annotation) { Schema textSchema = getTextAnnotationSchema(); StructuredRecord.Builder builder = StructuredRecord.builder(textSchema); diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/TransformerFactory.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/TransformerFactory.java old mode 100644 new mode 100755 index 52cd1b9..0cc0a59 --- a/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/TransformerFactory.java +++ b/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/TransformerFactory.java @@ -19,8 +19,8 @@ import io.cdap.plugin.cloud.vision.transform.ImageFeature; /** - * A factory which creates instance of {@ImageAnnotationToRecordTransformer} in accordance to feature and output schema - * configured in input config. + * A factory which creates instances of {@ImageAnnotationToRecordTransformer} in accordance to the feature and output + * schema configured in input config. */ public class TransformerFactory { public static ImageAnnotationToRecordTransformer createInstance(ImageFeature imageFeature, String outputFieldName, diff --git a/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/WebDetectionToRecordTransformer.java b/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/WebDetectionToRecordTransformer.java old mode 100644 new mode 100755 index 82b41f1..d4fb683 --- a/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/WebDetectionToRecordTransformer.java +++ b/src/main/java/io/cdap/plugin/cloud/vision/transform/transformer/WebDetectionToRecordTransformer.java @@ -24,7 +24,6 @@ import java.util.List; import java.util.stream.Collectors; - /** * Transforms web detection of specified {@link AnnotateImageResponse} to {@link StructuredRecord} according * to the specified schema. @@ -35,6 +34,13 @@ public WebDetectionToRecordTransformer(Schema schema, String outputFieldName) { super(schema, outputFieldName); } + /** + * Extract the entire mapping of a {@link AnnotateImageResponse} object to a {@link StructuredRecord} + * using the {@link WebDetectionSchema}. This {@link StructuredRecord} can then be turned into a json document. + * + * @param input {@link StructuredRecord} to add to. + * @param annotateImageResponse {@link AnnotateImageResponse} to get the data from. + */ @Override public StructuredRecord transform(StructuredRecord input, AnnotateImageResponse annotateImageResponse) { WebDetection webDetection = annotateImageResponse.getWebDetection(); @@ -43,6 +49,13 @@ public StructuredRecord transform(StructuredRecord input, AnnotateImageResponse .build(); } + /** + * Extract a {@link StructuredRecord} containing the Web detection information from a + * {@link WebDetection} input using a {@link Schema} for the mapping. + * + * @param webDetection The {@link WebDetection} object containing the data. + * @return A {@link StructuredRecord} containing the data mapped. + */ private StructuredRecord extractWebDetection(WebDetection webDetection) { Schema webSchema = getWebDetectionSchema(); StructuredRecord.Builder builder = StructuredRecord.builder(webSchema); @@ -103,6 +116,14 @@ private StructuredRecord extractWebDetection(WebDetection webDetection) { return builder.build(); } + /** + * Extract a {@link StructuredRecord} containing the Web entity information from a + * {@link WebDetection.WebEntity} input using a {@link Schema} for the mapping. + * + * @param webEntity The {@link WebDetection.WebEntity} object containing the data. + * @param schema The {@link Schema} to use. + * @return A {@link StructuredRecord} containing the data mapped. + */ private StructuredRecord extractEntity(WebDetection.WebEntity webEntity, Schema schema) { StructuredRecord.Builder builder = StructuredRecord.builder(schema); if (schema.getField(WebDetectionSchema.WebEntity.ENTITY_ID_FIELD_NAME) != null) { @@ -118,6 +139,14 @@ private StructuredRecord extractEntity(WebDetection.WebEntity webEntity, Schema return builder.build(); } + /** + * Extract a {@link StructuredRecord} containing the Web image information from a + * {@link WebDetection.WebImage} input using a {@link Schema} for the mapping. + * + * @param webImage The {@link WebDetection.WebImage} object containing the data. + * @param schema The {@link Schema} to use. + * @return A {@link StructuredRecord} containing the data mapped. + */ private StructuredRecord extractWebImage(WebDetection.WebImage webImage, Schema schema) { StructuredRecord.Builder builder = StructuredRecord.builder(schema); if (schema.getField(WebDetectionSchema.WebImage.URL_FIELD_NAME) != null) { @@ -130,6 +159,14 @@ private StructuredRecord extractWebImage(WebDetection.WebImage webImage, Schema return builder.build(); } + /** + * Extract a {@link StructuredRecord} containing the Web label information from a + * {@link WebDetection.WebLabel} input using a {@link Schema} for the mapping. + * + * @param webLabel The {@link WebDetection.WebLabel} object containing the data. + * @param schema The {@link Schema} to use. + * @return A {@link StructuredRecord} containing the data mapped. + */ private StructuredRecord extractWebLabel(WebDetection.WebLabel webLabel, Schema schema) { StructuredRecord.Builder builder = StructuredRecord.builder(schema); if (schema.getField(WebDetectionSchema.BestGuessLabel.LABEL_FIELD_NAME) != null) { @@ -142,6 +179,14 @@ private StructuredRecord extractWebLabel(WebDetection.WebLabel webLabel, Schema return builder.build(); } + /** + * Extract a {@link StructuredRecord} containing the Web page information from a + * {@link WebDetection.WebPage} input using a {@link Schema} for the mapping. + * + * @param webPage The {@link WebDetection.WebPage} object containing the data. + * @param schema The {@link Schema} to use. + * @return A {@link StructuredRecord} containing the data mapped. + */ private StructuredRecord extractWebPage(WebDetection.WebPage webPage, Schema schema) { StructuredRecord.Builder builder = StructuredRecord.builder(schema); if (schema.getField(WebDetectionSchema.WebPage.URL_FIELD_NAME) != null) { diff --git a/src/test/java/io/cdap/plugin/cloud/vision/CloudVisionConfigBuilder.java b/src/test/java/io/cdap/plugin/cloud/vision/CloudVisionConfigBuilder.java old mode 100644 new mode 100755 diff --git a/src/test/java/io/cdap/plugin/cloud/vision/GcsBucketHelperTest.java b/src/test/java/io/cdap/plugin/cloud/vision/GcsBucketHelperTest.java new file mode 100644 index 0000000..07b7a68 --- /dev/null +++ b/src/test/java/io/cdap/plugin/cloud/vision/GcsBucketHelperTest.java @@ -0,0 +1,67 @@ +/* + * Copyright © 2019 Cask Data, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not + * use this file except in compliance with the License. You may obtain a copy of + * the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT + * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the + * License for the specific language governing permissions and limitations under + * the License. + */ + +package io.cdap.plugin.cloud.vision; + +import com.google.auth.Credentials; +import com.google.cloud.storage.Blob; +import io.cdap.plugin.cloud.vision.action.GcsBucketHelper; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; +import java.util.List; + +/** + * Test that we can access blobs using a GCSPath and that the batching computation doesn't trigger + * an exception. + */ +public class GcsBucketHelperTest { + protected static final String PATH = System.getProperty("path", "gs://vision-api-pbo2/images"); + protected static final String SERVICE_ACCOUNT_FILE_PATH = System.getProperty("serviceFilePath", + "auto-detect"); + // This is a limit imposed by the Cloud Vision API + protected static final int MAX_NUMBER_OF_IMAGES_PER_BATCH = 2000; + + @Test + public void getAllFilesInPath() throws Exception { + Credentials credentials = CredentialsHelper.getCredentials(SERVICE_ACCOUNT_FILE_PATH); + + List blobs = GcsBucketHelper.getAllFilesInPath(PATH, credentials); + if (blobs.isEmpty()) { + throw new Exception("Cannot get blobs in: " + PATH); + } + + System.out.println(); + System.out.println("Using service acount path: " + SERVICE_ACCOUNT_FILE_PATH); + System.out.println("Blobs found in: " + PATH); + for (Blob blob : blobs) { + System.out.println("blob.getName(): " + blob.getName()); + } + + int countBlobs = 0; + for (int batchId = 0; batchId < (1 + blobs.size() / MAX_NUMBER_OF_IMAGES_PER_BATCH); batchId++) { + for (int index = batchId * MAX_NUMBER_OF_IMAGES_PER_BATCH; + (index < (batchId + 1) * MAX_NUMBER_OF_IMAGES_PER_BATCH) && (index < blobs.size()); + index++) { + countBlobs++; + } + } + + // Make sure we have taken into the loop as many blobs as returned from GCS + Assert.assertEquals(countBlobs, blobs.size()); + } +} diff --git a/src/test/java/io/cdap/plugin/cloud/vision/ValidationAssertions.java b/src/test/java/io/cdap/plugin/cloud/vision/ValidationAssertions.java old mode 100644 new mode 100755 diff --git a/src/test/java/io/cdap/plugin/cloud/vision/action/OfflineImageExtractorActionConfigTest.java b/src/test/java/io/cdap/plugin/cloud/vision/action/OfflineImageExtractorActionConfigTest.java old mode 100644 new mode 100755 index ad13ad8..1f87199 --- a/src/test/java/io/cdap/plugin/cloud/vision/action/OfflineImageExtractorActionConfigTest.java +++ b/src/test/java/io/cdap/plugin/cloud/vision/action/OfflineImageExtractorActionConfigTest.java @@ -23,7 +23,6 @@ import io.cdap.plugin.cloud.vision.transform.ImageFeature; import org.junit.Assert; import org.junit.Test; - import java.util.Collections; import java.util.Iterator; import java.util.List; @@ -122,6 +121,30 @@ public void testValidateAspectRatios() { assertValidationFailed(collector, paramNames); } + @Test + public void testValidateOneLanguageHint() { + // Those are all the language ids retrieved from 'widgets/DocumentExtractor-transform.json' + String[] languages = new String[]{"af", "sq", "ar", "hy", "be", "bn", "bg", "ca", "zh", "hr", "cs", "da", "nl", + "en", "et", "fil", "fi", "fr", "de", "el", "gu", "iw", "hu", "is", "id", "it", "ja", "kn", "km", "ko", "lo", + "lv", "lt", "mk", "ms", "ml", "mr", "ne", "no", "fa", "pl", "pt", "pa", "ro", "ru", "ru-PETR1708", "sr", + "sr-Latn", "sk", "sl", "es", "sv", "ta", "te", "th", "tr", "uk", "vi", "yi"}; + + MockFailureCollector collector = new MockFailureCollector(MOCK_STAGE); + List> paramNames = Collections.singletonList( + Collections.singletonList(ActionConstants.LANGUAGE_HINTS) + ); + + // Validate the languages one by one + for (String language : languages) { + OfflineImageExtractorActionConfig config = OfflineImageExtractorActionConfig.builder(VALID_CONFIG) + .setFeatures(ImageFeature.TEXT.getDisplayName()) + .setLanguageHints(language) + .build(); + config.validate(collector); + } + Assert.assertTrue(collector.getValidationFailures().isEmpty()); + } + private void assertValidationFailed(MockFailureCollector failureCollector, List> paramNames) { List failureList = failureCollector.getValidationFailures(); Assert.assertEquals(paramNames.size(), failureList.size()); diff --git a/src/test/java/io/cdap/plugin/cloud/vision/action/OfflineImageExtractorActionTest.java b/src/test/java/io/cdap/plugin/cloud/vision/action/OfflineImageExtractorActionTest.java old mode 100644 new mode 100755 index cb6eafd..49e110e --- a/src/test/java/io/cdap/plugin/cloud/vision/action/OfflineImageExtractorActionTest.java +++ b/src/test/java/io/cdap/plugin/cloud/vision/action/OfflineImageExtractorActionTest.java @@ -29,8 +29,8 @@ import io.cdap.cdap.etl.mock.action.MockActionContext; import io.cdap.plugin.cloud.vision.CloudVisionConstants; import io.cdap.plugin.cloud.vision.CredentialsHelper; -import io.cdap.plugin.cloud.vision.source.GCSPath; import io.cdap.plugin.cloud.vision.transform.ImageFeature; +import io.cdap.plugin.gcp.gcs.GCSPath; import org.junit.After; import org.junit.Assert; import org.junit.Before; @@ -38,6 +38,7 @@ import java.io.FileInputStream; import java.io.FileNotFoundException; +import java.util.ArrayList; import javax.annotation.Nullable; /** @@ -46,27 +47,51 @@ public class OfflineImageExtractorActionTest { private static final String PROJECT = System.getProperty("project", CloudVisionConstants.AUTO_DETECT); - private static final String SERVICE_ACCOUNT_FILE_PATH = - System.getProperty("serviceFilePath", CloudVisionConstants.AUTO_DETECT); - private static final String PATH = System.getProperty("path", - "gs://cloud-vision-cdap-text-image-extractor-offline"); + + private static final String SERVICE_ACCOUNT_FILE_PATH = System.getProperty("serviceFilePath", + CloudVisionConstants.AUTO_DETECT); + private static final String PATH = System.getProperty("path", "gs://cloud-vision-cdap-tests"); + private static final String IMAGE_PATH_IN_BUCKET = "images"; + private static final String PATH_PATTERN = "%s/%s/"; private static final String RESULT_PATH_PATTERN = "%s/%s"; private static final String JPG_CONTENT_TYPE = "image/jpeg"; - private static final String OBJECT_IMAGE_NAME = "multiple_objects.jpg"; - private static final String SAFE_IMAGE_NAME = "safe_search.jpg"; - private static final String CROP_HINTS_IMAGE_FILE_PATH = "gs://cloud-samples-data/vision/crop_hints/bubble.jpeg"; - private static final String FACE_IMAGE_FILE_PATH = "gs://cloud-samples-data/vision/face/faces.jpeg"; - private static final String PROPERTIES_IMAGE_FILE_PATH = - "gs://cloud-samples-data/vision/image_properties/bali.jpeg"; - private static final String LABELS_IMAGE_FILE_PATH = "gs://cloud-samples-data/vision/label/setagaya.jpeg"; - private static final String LANDMARKS_IMAGE_FILE_PATH = "gs://cloud-samples-data/vision/landmark/st_basils.jpeg"; - private static final String LOGOS_IMAGE_FILE_PATH = "gs://cloud-samples-data/vision/logo/google_logo.jpg"; - private static final String OBJECTS_IMAGE_FILE_PATH = String.format(RESULT_PATH_PATTERN, PATH, OBJECT_IMAGE_NAME); - private static final String SAFE_SEARCH_IMAGE_FILE_PATH = String.format(RESULT_PATH_PATTERN, PATH, SAFE_IMAGE_NAME); - private static final String TEXT_IMAGE_FILE_PATH = "gs://cloud-samples-data/vision/ocr/sign.jpg"; - private static final String WEB_DETECTION_IMAGE_FILE_PATH = "gs://cloud-samples-data/vision/web/carnaval.jpeg"; + + private static final String LOCAL_IMAGE_FOLDER_PATH = "examples/images/jpg"; + + private static final String CROP_HINTS_IMAGE_NAME = "bubble.jpg"; + private static final String FACE_IMAGE_NAME = "faces.jpg"; + private static final String LABELS_IMAGE_NAME = "setagaya.jpg"; + private static final String LANDMARKS_IMAGE_NAME = "st_basils.jpg"; + private static final String LOGOS_IMAGE_NAME = "google_logo.jpg"; + private static final String OBJECT_IMAGE_NAME = "fallon-michael-8LKQfBumjMo.jpg"; + private static final String PROPERTIES_IMAGE_NAME = "bali.jpg"; + private static final String SAFE_SEARCH_IMAGE_NAME = "keyur-nandaniya-oEgiJNbYw8w.jpg"; + private static final String TEXT_IMAGE_NAME = "sign.jpg"; + private static final String WEB_DETECTION_IMAGE_NAME = "carnaval.jpg"; + + private static final String CROP_HINTS_IMAGE_FILE_PATH = PATH + "/" + IMAGE_PATH_IN_BUCKET + + "/" + CROP_HINTS_IMAGE_NAME; + private static final String FACE_IMAGE_FILE_PATH = PATH + "/" + IMAGE_PATH_IN_BUCKET + + "/" + FACE_IMAGE_NAME; + private static final String LABELS_IMAGE_FILE_PATH = PATH + "/" + IMAGE_PATH_IN_BUCKET + + "/" + LABELS_IMAGE_NAME; + private static final String LANDMARKS_IMAGE_FILE_PATH = PATH + "/" + IMAGE_PATH_IN_BUCKET + + "/" + LANDMARKS_IMAGE_NAME; + private static final String LOGOS_IMAGE_FILE_PATH = PATH + "/" + IMAGE_PATH_IN_BUCKET + + "/" + LOGOS_IMAGE_NAME; + private static final String OBJECTS_IMAGE_FILE_PATH = PATH + "/" + IMAGE_PATH_IN_BUCKET + + "/" + OBJECT_IMAGE_NAME; + private static final String PROPERTIES_IMAGE_FILE_PATH = PATH + "/" + IMAGE_PATH_IN_BUCKET + + "/" + PROPERTIES_IMAGE_NAME; + private static final String SAFE_SEARCH_IMAGE_FILE_PATH = PATH + "/" + IMAGE_PATH_IN_BUCKET + + "/" + SAFE_SEARCH_IMAGE_NAME; + private static final String TEXT_IMAGE_FILE_PATH = PATH + "/" + IMAGE_PATH_IN_BUCKET + "/" + + TEXT_IMAGE_NAME; + private static final String WEB_DETECTION_IMAGE_FILE_PATH = PATH + "/" + IMAGE_PATH_IN_BUCKET + + "/" + WEB_DETECTION_IMAGE_NAME; + private static final String RESULT_FILE_NAME = "output-1-to-1.json"; private static final String BATCH_SIZE = "20"; @@ -85,6 +110,42 @@ public class OfflineImageExtractorActionTest { private static Storage storage; private static Bucket bucket; + private static final String FOLDER_CONTENT_TYPE = "application/x-www-form-urlencoded;charset=UTF-8"; + + private static void createFolder(String folderName) throws FileNotFoundException { + System.out.println("Creating folder: " + folderName); + FileInputStream fileInputStream = new FileInputStream("examples/empty_file.txt"); + bucket.create(folderName + "/", + fileInputStream, + FOLDER_CONTENT_TYPE, + Bucket.BlobWriteOption.doesNotExist()); + } + + private static void uploadImages() throws FileNotFoundException { + ArrayList allImages = new ArrayList<>(10); + allImages.add(CROP_HINTS_IMAGE_NAME); + allImages.add(FACE_IMAGE_NAME); + allImages.add(LABELS_IMAGE_NAME); + allImages.add(LANDMARKS_IMAGE_NAME); + allImages.add(LOGOS_IMAGE_NAME); + allImages.add(OBJECT_IMAGE_NAME); + allImages.add(PROPERTIES_IMAGE_NAME); + allImages.add(SAFE_SEARCH_IMAGE_NAME); + allImages.add(TEXT_IMAGE_NAME); + allImages.add(WEB_DETECTION_IMAGE_NAME); + + for (String imageName : allImages) { + String localImagePath = LOCAL_IMAGE_FOLDER_PATH + "/" + imageName; + + FileInputStream fileInputStream = new FileInputStream(localImagePath); + System.out.println("Uploading " + localImagePath + " to " + bucket.getName()); + bucket.create(IMAGE_PATH_IN_BUCKET + "/" + imageName, + fileInputStream, + JPG_CONTENT_TYPE, + Bucket.BlobWriteOption.doesNotExist()); + } + } + @Before public void testSetup() throws Exception { Credentials credentials = CredentialsHelper.getCredentials(SERVICE_ACCOUNT_FILE_PATH); @@ -98,12 +159,13 @@ public void testSetup() throws Exception { deleteBucket(storage, bucket); } + System.out.println("Creating bucket: " + bucketName); BucketInfo bucketInfo = BucketInfo.newBuilder(bucketName) .setStorageClass(StorageClass.STANDARD) - .setLocation("us-central1") + .setLocation("us-east1") .build(); bucket = storage.create(bucketInfo); - initTestsData(); + uploadImages(); } @After @@ -111,22 +173,14 @@ public void destroy() { deleteBucket(storage, bucket); } - private static void initTestsData() throws FileNotFoundException { - String path = String.format("src/test/resources/%s", OBJECT_IMAGE_NAME); - - FileInputStream serviceAccountStream = new FileInputStream(path); - bucket.create(OBJECT_IMAGE_NAME, serviceAccountStream, JPG_CONTENT_TYPE, Bucket.BlobWriteOption.doesNotExist()); - - path = String.format("src/test/resources/%s", SAFE_IMAGE_NAME); - - serviceAccountStream = new FileInputStream(path); - bucket.create(SAFE_IMAGE_NAME, serviceAccountStream, JPG_CONTENT_TYPE, Bucket.BlobWriteOption.doesNotExist()); - } - @Test public void testRunCropHintsFeature() throws Exception { String folder = "crop_hints"; String resultFolderPath = String.format(PATH_PATTERN, PATH, folder); + System.out.println("testRunCropHintsFeature() - resultFolderPath: " + resultFolderPath); + + createFolder(folder); + OfflineImageExtractorActionConfig config = OfflineImageExtractorActionConfig.builder(CONFIG) .setSourcePath(CROP_HINTS_IMAGE_FILE_PATH) .setDestinationPath(resultFolderPath) @@ -145,6 +199,10 @@ public void testRunCropHintsFeature() throws Exception { public void testRunFaceFeature() throws Exception { String folder = "face"; String resultFolderPath = String.format(PATH_PATTERN, PATH, folder); + System.out.println("testRunFaceFeature() - resultFolderPath: " + resultFolderPath); + + createFolder(folder); + OfflineImageExtractorActionConfig config = OfflineImageExtractorActionConfig.builder(CONFIG) .setSourcePath(FACE_IMAGE_FILE_PATH) .setDestinationPath(resultFolderPath) @@ -161,6 +219,10 @@ public void testRunFaceFeature() throws Exception { public void testRunImagePropertiesFeature() throws Exception { String folder = "properties"; String resultFolderPath = String.format(PATH_PATTERN, PATH, folder); + System.out.println("testRunImagePropertiesFeature() - resultFolderPath: " + resultFolderPath); + + createFolder(folder); + OfflineImageExtractorActionConfig config = OfflineImageExtractorActionConfig.builder(CONFIG) .setSourcePath(PROPERTIES_IMAGE_FILE_PATH) .setDestinationPath(resultFolderPath) @@ -178,6 +240,10 @@ public void testRunImagePropertiesFeature() throws Exception { public void testRunLabelsFeature() throws Exception { String folder = "labels"; String resultFolderPath = String.format(PATH_PATTERN, PATH, folder); + System.out.println("testRunLabelsFeature() - resultFolderPath: " + resultFolderPath); + + createFolder(folder); + OfflineImageExtractorActionConfig config = OfflineImageExtractorActionConfig.builder(CONFIG) .setSourcePath(LABELS_IMAGE_FILE_PATH) .setDestinationPath(resultFolderPath) @@ -195,6 +261,10 @@ public void testRunLabelsFeature() throws Exception { public void testRunLandmarksFeature() throws Exception { String folder = "landmarks"; String resultFolderPath = String.format(PATH_PATTERN, PATH, folder); + System.out.println("testRunLandmarksFeature() - resultFolderPath: " + resultFolderPath); + + createFolder(folder); + OfflineImageExtractorActionConfig config = OfflineImageExtractorActionConfig.builder(CONFIG) .setSourcePath(LANDMARKS_IMAGE_FILE_PATH) .setDestinationPath(resultFolderPath) @@ -212,6 +282,10 @@ public void testRunLandmarksFeature() throws Exception { public void testRunLogosFeature() throws Exception { String folder = "logos"; String resultFolderPath = String.format(PATH_PATTERN, PATH, folder); + + createFolder(folder); + + System.out.println("testRunLogosFeature() - resultFolderPath: " + resultFolderPath); OfflineImageExtractorActionConfig config = OfflineImageExtractorActionConfig.builder(CONFIG) .setSourcePath(LOGOS_IMAGE_FILE_PATH) .setDestinationPath(resultFolderPath) @@ -229,6 +303,10 @@ public void testRunLogosFeature() throws Exception { public void testRunObjectLocalizationFeature() throws Exception { String folder = "objects"; String resultFolderPath = String.format(PATH_PATTERN, PATH, folder); + System.out.println("testRunObjectLocalizationFeature() - resultFolderPath: " + resultFolderPath); + + createFolder(folder); + OfflineImageExtractorActionConfig config = OfflineImageExtractorActionConfig.builder(CONFIG) .setSourcePath(OBJECTS_IMAGE_FILE_PATH) .setDestinationPath(resultFolderPath) @@ -246,6 +324,10 @@ public void testRunObjectLocalizationFeature() throws Exception { public void testRunTextFeature() throws Exception { String folder = "text"; String resultFolderPath = String.format(PATH_PATTERN, PATH, folder); + System.out.println("testRunTextFeature() - resultFolderPath: " + resultFolderPath); + + createFolder(folder); + OfflineImageExtractorActionConfig config = OfflineImageExtractorActionConfig.builder(CONFIG) .setSourcePath(TEXT_IMAGE_FILE_PATH) .setDestinationPath(resultFolderPath) @@ -264,6 +346,10 @@ public void testRunTextFeature() throws Exception { public void testRunSafeSearchFeature() throws Exception { String folder = "safe"; String resultFolderPath = String.format(PATH_PATTERN, PATH, folder); + System.out.println("testRunSafeSearchFeature() - resultFolderPath: " + resultFolderPath); + + createFolder(folder); + OfflineImageExtractorActionConfig config = OfflineImageExtractorActionConfig.builder(CONFIG) .setSourcePath(SAFE_SEARCH_IMAGE_FILE_PATH) .setDestinationPath(resultFolderPath) @@ -281,6 +367,10 @@ public void testRunSafeSearchFeature() throws Exception { public void testRunWebDetectionFeature() throws Exception { String folder = "web"; String resultFolderPath = String.format(PATH_PATTERN, PATH, folder); + System.out.println("testRunSafeSearchFeature() - resultFolderPath: " + resultFolderPath); + + createFolder(folder); + OfflineImageExtractorActionConfig config = OfflineImageExtractorActionConfig.builder(CONFIG) .setSourcePath(WEB_DETECTION_IMAGE_FILE_PATH) .setDestinationPath(resultFolderPath) @@ -295,9 +385,10 @@ public void testRunWebDetectionFeature() throws Exception { } private void validateResult(String folder, String expectedUri) throws InterruptedException { - Thread.sleep(30000); - - Blob blob = bucket.get(String.format(RESULT_PATH_PATTERN, folder, RESULT_FILE_NAME)); + Thread.sleep(10000); // Wait a bit to make sure the blob is available in the bucket + String blobPath = String.format(RESULT_PATH_PATTERN, folder, RESULT_FILE_NAME); + System.out.println("validateResult() - blobPath: " + blobPath); + Blob blob = bucket.get(blobPath); Assert.assertTrue(blob.exists()); String content = new String(blob.getContent()); @@ -325,6 +416,10 @@ private static Storage getStorage(String project, @Nullable Credentials credenti } private static void deleteBucket(Storage storage, Bucket bucket) { + if (bucket == null || bucket.list() == null) { + return; + } + System.out.println("Deleting bucket: " + bucket.getName()); for (Blob blob : bucket.list().iterateAll()) { storage.delete(blob.getBlobId()); } diff --git a/src/test/java/io/cdap/plugin/cloud/vision/action/TextExtractorActionConfigTest.java b/src/test/java/io/cdap/plugin/cloud/vision/action/OfflineTextExtractorActionConfigTest.java similarity index 79% rename from src/test/java/io/cdap/plugin/cloud/vision/action/TextExtractorActionConfigTest.java rename to src/test/java/io/cdap/plugin/cloud/vision/action/OfflineTextExtractorActionConfigTest.java index 498cb83..c54753d 100644 --- a/src/test/java/io/cdap/plugin/cloud/vision/action/TextExtractorActionConfigTest.java +++ b/src/test/java/io/cdap/plugin/cloud/vision/action/OfflineTextExtractorActionConfigTest.java @@ -23,11 +23,11 @@ import org.junit.Test; /** - * Test class for {@link TextExtractorActionConfigTest}. + * Test class for {@link OfflineTextExtractorActionConfigTest}. */ -public class TextExtractorActionConfigTest { +public class OfflineTextExtractorActionConfigTest { private static final String MOCK_STAGE = "mockStage"; - private static final TextExtractorActionConfig VALID_CONFIG = new TextExtractorActionConfig( + private static final OfflineTextExtractorActionConfig VALID_CONFIG = new OfflineTextExtractorActionConfig( "/path", "/path", "/path", @@ -45,19 +45,19 @@ public void testValidConfig() { @Test public void testEmptyServiceFilePath() { - TextExtractorActionConfig config = TextExtractorActionConfig.builder(VALID_CONFIG) + OfflineTextExtractorActionConfig config = OfflineTextExtractorActionConfig.builder(VALID_CONFIG) .setServiceFilePath("") .build(); MockFailureCollector failureCollector = new MockFailureCollector(MOCK_STAGE); config.validate(failureCollector); ValidationAssertions.assertPropertyValidationFailed(failureCollector, - CloudVisionConstants.SERVICE_ACCOUNT_FILE_PATH); + CloudVisionConstants.SERVICE_ACCOUNT_FILE_PATH); } @Test public void testEmptySourcePath() { - TextExtractorActionConfig config = TextExtractorActionConfig.builder(VALID_CONFIG) + OfflineTextExtractorActionConfig config = OfflineTextExtractorActionConfig.builder(VALID_CONFIG) .setSourcePath("") .build(); @@ -68,7 +68,7 @@ public void testEmptySourcePath() { @Test public void testEmptyDestinationPath() { - TextExtractorActionConfig config = TextExtractorActionConfig.builder(VALID_CONFIG) + OfflineTextExtractorActionConfig config = OfflineTextExtractorActionConfig.builder(VALID_CONFIG) .setDestinationPath("") .build(); diff --git a/src/test/java/io/cdap/plugin/cloud/vision/action/TextExtractorActionTest.java b/src/test/java/io/cdap/plugin/cloud/vision/action/OfflineTextExtractorActionTest.java similarity index 92% rename from src/test/java/io/cdap/plugin/cloud/vision/action/TextExtractorActionTest.java rename to src/test/java/io/cdap/plugin/cloud/vision/action/OfflineTextExtractorActionTest.java index 2cf1b52..9b4e62c 100644 --- a/src/test/java/io/cdap/plugin/cloud/vision/action/TextExtractorActionTest.java +++ b/src/test/java/io/cdap/plugin/cloud/vision/action/OfflineTextExtractorActionTest.java @@ -28,20 +28,19 @@ import io.cdap.cdap.etl.api.action.ActionContext; import io.cdap.cdap.etl.mock.action.MockActionContext; import io.cdap.plugin.cloud.vision.CredentialsHelper; -import io.cdap.plugin.cloud.vision.source.GCSPath; +import io.cdap.plugin.gcp.gcs.GCSPath; import org.junit.After; import org.junit.Assert; import org.junit.Before; import org.junit.Test; - import java.io.FileInputStream; import java.io.FileNotFoundException; import javax.annotation.Nullable; /** - * Test class for {@link TextExtractorAction}. + * Test class for {@link OfflineTextExtractorAction}. */ -public class TextExtractorActionTest { +public class OfflineTextExtractorActionTest { protected static final String PROJECT = System.getProperty("project", "auto-detect"); protected static final String SERVICE_ACCOUNT_FILE_PATH = System.getProperty("serviceFilePath", "auto-detect"); protected static final String PATH = System.getProperty("path", "gs://cloud-vision-cdap-text-offline"); @@ -94,7 +93,7 @@ private static void initTestsData() throws FileNotFoundException { @Test public void testRun() throws Exception { - TextExtractorActionConfig config = new TextExtractorActionConfig( + OfflineTextExtractorActionConfig config = new OfflineTextExtractorActionConfig( SERVICE_ACCOUNT_FILE_PATH, PDF_FILE_PATH, RESULT_FOLDER_PATH, @@ -103,7 +102,7 @@ public void testRun() throws Exception { null ); - TextExtractorAction action = new TextExtractorAction(config); + OfflineTextExtractorAction action = new OfflineTextExtractorAction(config); ActionContext context = new MockActionContext(); action.run(context); @@ -144,6 +143,9 @@ private static Storage getStorage(String project, @Nullable Credentials credenti } private static void deleteBucket(Storage storage, Bucket bucket) { + if (bucket == null || bucket.list() == null) { + return; + } for (Blob blob : bucket.list().iterateAll()) { storage.delete(blob.getBlobId()); } diff --git a/src/test/java/io/cdap/plugin/cloud/vision/transform/ExtractorTransformConfigBuilder.java b/src/test/java/io/cdap/plugin/cloud/vision/transform/ExtractorTransformConfigBuilder.java old mode 100644 new mode 100755 index d34f781..acbeae1 --- a/src/test/java/io/cdap/plugin/cloud/vision/transform/ExtractorTransformConfigBuilder.java +++ b/src/test/java/io/cdap/plugin/cloud/vision/transform/ExtractorTransformConfigBuilder.java @@ -20,6 +20,8 @@ /** * Builder class that provides handy methods to construct {@link ExtractorTransformConfig} for testing. + * + * @param Generic type. */ public abstract class ExtractorTransformConfigBuilder extends CloudVisionConfigBuilder { diff --git a/src/test/java/io/cdap/plugin/cloud/vision/transform/ExtractorTransformConfigTest.java b/src/test/java/io/cdap/plugin/cloud/vision/transform/ExtractorTransformConfigTest.java old mode 100644 new mode 100755 diff --git a/src/test/java/io/cdap/plugin/cloud/vision/transform/document/DocumentExtractorTransformConfigBuilder.java b/src/test/java/io/cdap/plugin/cloud/vision/transform/document/DocumentExtractorTransformConfigBuilder.java old mode 100644 new mode 100755 diff --git a/src/test/java/io/cdap/plugin/cloud/vision/transform/document/DocumentExtractorTransformConfigTest.java b/src/test/java/io/cdap/plugin/cloud/vision/transform/document/DocumentExtractorTransformConfigTest.java old mode 100644 new mode 100755 diff --git a/src/test/java/io/cdap/plugin/cloud/vision/transform/image/ImageExtractorTransformConfigBuilder.java b/src/test/java/io/cdap/plugin/cloud/vision/transform/image/ImageExtractorTransformConfigBuilder.java old mode 100644 new mode 100755 diff --git a/src/test/java/io/cdap/plugin/cloud/vision/transform/image/ImageExtractorTransformConfigTest.java b/src/test/java/io/cdap/plugin/cloud/vision/transform/image/ImageExtractorTransformConfigTest.java old mode 100644 new mode 100755 diff --git a/src/test/java/io/cdap/plugin/cloud/vision/transform/transformer/BaseAnnotationsToRecordTransformerTest.java b/src/test/java/io/cdap/plugin/cloud/vision/transform/transformer/BaseAnnotationsToRecordTransformerTest.java old mode 100644 new mode 100755 diff --git a/src/test/java/io/cdap/plugin/cloud/vision/transform/transformer/CropHintsAnnotationsToRecordTransformerTest.java b/src/test/java/io/cdap/plugin/cloud/vision/transform/transformer/CropHintsAnnotationsToRecordTransformerTest.java old mode 100644 new mode 100755 diff --git a/src/test/java/io/cdap/plugin/cloud/vision/transform/transformer/FaceAnnotationsToRecordTransformerTest.java b/src/test/java/io/cdap/plugin/cloud/vision/transform/transformer/FaceAnnotationsToRecordTransformerTest.java old mode 100644 new mode 100755 index cf117a9..63d15ee --- a/src/test/java/io/cdap/plugin/cloud/vision/transform/transformer/FaceAnnotationsToRecordTransformerTest.java +++ b/src/test/java/io/cdap/plugin/cloud/vision/transform/transformer/FaceAnnotationsToRecordTransformerTest.java @@ -152,9 +152,9 @@ private void assertAnnotationEquals(FaceAnnotation expected, StructuredRecord ac Assert.assertEquals(expected.getUnderExposedLikelihood().name(), actual.get(FaceAnnotationSchema.UNDER_EXPOSED_FIELD_NAME)); - List position = actual.get(FaceAnnotationSchema.POSITION_FIELD_NAME); + List position = actual.get(FaceAnnotationSchema.BOUNDING_POLY_NAME); assertPositionEqual(expected.getBoundingPoly(), position); - List fdPosition = actual.get(FaceAnnotationSchema.FD_POSITION_FIELD_NAME); + List fdPosition = actual.get(FaceAnnotationSchema.FD_BOUNDING_POLY_NAME); assertPositionEqual(expected.getFdBoundingPoly(), fdPosition); List landmarks = actual.get(FaceAnnotationSchema.LANDMARKS_FIELD_NAME); assertLandmarksEqual(expected.getLandmarksList(), landmarks); diff --git a/src/test/java/io/cdap/plugin/cloud/vision/transform/transformer/FullTextAnnotationsToRecordTransformerTest.java b/src/test/java/io/cdap/plugin/cloud/vision/transform/transformer/FullTextAnnotationsToRecordTransformerTest.java old mode 100644 new mode 100755 index bf0b9f0..04fe9c9 --- a/src/test/java/io/cdap/plugin/cloud/vision/transform/transformer/FullTextAnnotationsToRecordTransformerTest.java +++ b/src/test/java/io/cdap/plugin/cloud/vision/transform/transformer/FullTextAnnotationsToRecordTransformerTest.java @@ -190,7 +190,7 @@ private void assertLanguageEquals(TextAnnotation.DetectedLanguage expected, Stru actual.get(FullTextAnnotationSchema.DetectedLanguage.CONFIDENCE_FIELD_NAME), DELTA); Assert.assertEquals(expected.getLanguageCode(), - actual.get(FullTextAnnotationSchema.DetectedLanguage.CODE_FIELD_NAME)); + actual.get(FullTextAnnotationSchema.DetectedLanguage.LANGUAGE_CODE_FIELD_NAME)); } private void assertBlockEquals(Block expected, StructuredRecord actual) { diff --git a/src/test/java/io/cdap/plugin/cloud/vision/transform/transformer/ImagePropertiesAnnotationsToRecordTransformerTest.java b/src/test/java/io/cdap/plugin/cloud/vision/transform/transformer/ImagePropertiesAnnotationsToRecordTransformerTest.java old mode 100644 new mode 100755 diff --git a/src/test/java/io/cdap/plugin/cloud/vision/transform/transformer/LabelAnnotationsToRecordTransformerTest.java b/src/test/java/io/cdap/plugin/cloud/vision/transform/transformer/LabelAnnotationsToRecordTransformerTest.java old mode 100644 new mode 100755 diff --git a/src/test/java/io/cdap/plugin/cloud/vision/transform/transformer/LandmarkAnnotationsToRecordTransformerTest.java b/src/test/java/io/cdap/plugin/cloud/vision/transform/transformer/LandmarkAnnotationsToRecordTransformerTest.java old mode 100644 new mode 100755 diff --git a/src/test/java/io/cdap/plugin/cloud/vision/transform/transformer/LocalizedObjectAnnotationsToRecordTransformerTest.java b/src/test/java/io/cdap/plugin/cloud/vision/transform/transformer/LocalizedObjectAnnotationsToRecordTransformerTest.java old mode 100644 new mode 100755 diff --git a/src/test/java/io/cdap/plugin/cloud/vision/transform/transformer/LogoAnnotationsToRecordTransformerTest.java b/src/test/java/io/cdap/plugin/cloud/vision/transform/transformer/LogoAnnotationsToRecordTransformerTest.java old mode 100644 new mode 100755 diff --git a/src/test/java/io/cdap/plugin/cloud/vision/transform/transformer/ProductSearchResultToRecordTransformerTest.java b/src/test/java/io/cdap/plugin/cloud/vision/transform/transformer/ProductSearchResultToRecordTransformerTest.java old mode 100644 new mode 100755 diff --git a/src/test/java/io/cdap/plugin/cloud/vision/transform/transformer/SafeSearchAnnotationsToRecordTransformerTest.java b/src/test/java/io/cdap/plugin/cloud/vision/transform/transformer/SafeSearchAnnotationsToRecordTransformerTest.java old mode 100644 new mode 100755 diff --git a/src/test/java/io/cdap/plugin/cloud/vision/transform/transformer/TextAnnotationsToRecordTransformerTest.java b/src/test/java/io/cdap/plugin/cloud/vision/transform/transformer/TextAnnotationsToRecordTransformerTest.java old mode 100644 new mode 100755 diff --git a/src/test/java/io/cdap/plugin/cloud/vision/transform/transformer/WebDetectionRecordTransformerTest.java b/src/test/java/io/cdap/plugin/cloud/vision/transform/transformer/WebDetectionRecordTransformerTest.java old mode 100644 new mode 100755 diff --git a/widgets/TextExtractorOffline-action.json b/widgets/OfflineTextExtractor-action.json similarity index 98% rename from widgets/TextExtractorOffline-action.json rename to widgets/OfflineTextExtractor-action.json index 2571335..f3a7ce8 100644 --- a/widgets/TextExtractorOffline-action.json +++ b/widgets/OfflineTextExtractor-action.json @@ -2,7 +2,7 @@ "metadata": { "spec-version": "1.5" }, - "display-name": "Text Extractor Offline", + "display-name": "Offline Text Extractor", "configuration-groups": [ { "label": "Basic", @@ -33,7 +33,8 @@ "default": "application/pdf", "values": [ "application/pdf", - "image/tiff" + "image/tiff", + "image/gif" ] } },