From 22e5fd00138140728358193ce1a76efe5687565c Mon Sep 17 00:00:00 2001 From: Toke Eskildsen Date: Sat, 14 Oct 2023 21:15:55 +0200 Subject: [PATCH 01/10] Add first implementation of RestrictedSolrClient --- .../solr/RestrictedSolrClient.java | 464 ++++++++++++++++++ 1 file changed, 464 insertions(+) create mode 100644 src/main/java/dk/kb/netarchivesuite/solrwayback/solr/RestrictedSolrClient.java diff --git a/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/RestrictedSolrClient.java b/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/RestrictedSolrClient.java new file mode 100644 index 00000000..6ef0bd91 --- /dev/null +++ b/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/RestrictedSolrClient.java @@ -0,0 +1,464 @@ +/* + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + */ +package dk.kb.netarchivesuite.solrwayback.solr; + +import dk.kb.netarchivesuite.solrwayback.properties.PropertiesLoader; +import org.apache.solr.client.solrj.SolrClient; +import org.apache.solr.client.solrj.SolrRequest; +import org.apache.solr.client.solrj.SolrServerException; +import org.apache.solr.client.solrj.StreamingResponseCallback; +import org.apache.solr.client.solrj.beans.DocumentObjectBinder; +import org.apache.solr.client.solrj.impl.HttpSolrClient; +import org.apache.solr.client.solrj.response.QueryResponse; +import org.apache.solr.client.solrj.response.SolrPingResponse; +import org.apache.solr.client.solrj.response.UpdateResponse; +import org.apache.solr.common.SolrDocument; +import org.apache.solr.common.SolrDocumentList; +import org.apache.solr.common.SolrInputDocument; +import org.apache.solr.common.params.CommonParams; +import org.apache.solr.common.params.ModifiableSolrParams; +import org.apache.solr.common.params.SolrParams; +import org.apache.solr.common.util.NamedList; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.io.IOException; +import java.util.Collection; +import java.util.Iterator; +import java.util.List; +import java.util.Map; + +/** + * SolrClient wrapper that ensures that {@link PropertiesLoader#SOLR_PARAMS_MAP} + * are applied on each call. Also allows for overriding the default collection. + *

+ * All {@link SolrClient}s used in SolrWayback should be wrapped in this! + *

+ * Important: As the {@link #defaultCollection} is explicit, the given {@code inner} {@code SolrClient}s + * must not specify the collection name in its URL. A valid URL would be {@code http://localhost:8983/solr}. + *
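To make the construction contract concrete, here is a minimal usage sketch. It assumes the class compiles as given in this patch and lives on the classpath; the base URL and collection name simply mirror the javadoc examples above, and a real deployment would substitute its own values.

    import org.apache.solr.client.solrj.SolrClient;

    public class RestrictedSolrClientUsage {
        public static void main(String[] args) throws Exception {
            // Correct: the base URL carries no collection name; the collection is
            // passed separately and becomes the defaultCollection for every call.
            try (SolrClient solr = RestrictedSolrClient.createSolrClient(
                    "http://localhost:8983/solr", "netarchivebuilder")) {
                System.out.println("Ping status: " + solr.ping().getStatus());
            }
        }
    }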

+ * Recommended: + */ +public class RestrictedSolrClient extends SolrClient { + private static final Logger log = LoggerFactory.getLogger(RestrictedSolrClient.class); + + /** + * The collection to use for calls where no explicit collection is given. + */ + private final String defaultCollection; + /** + * All calls are delegated to this {@code SolrClient} after request parameters has been restricted. + */ + private final SolrClient inner; + /** + * Applied to all calls to the {@code RestrictedSolrClient}. + */ + private final Map fixedParams; + + /** + * Create a {@link HttpSolrClient} wrapped as a {@code RestrictedSolrClient} using the property + * {@link PropertiesLoader#SOLR_PARAMS_MAP} for restrictions and the given {@code collection} as + * {@link #defaultCollection}. + * @param solrBaseURL an URL to a Solr server, sans collection. Example: {@code http://localhost:8983/solr}. + * @param collection the collection to use for {@link #defaultCollection}. Example: {@code netarchivebuilder}. + * @return a {@code SolrClient} where all calls are restricted aka "safe". + */ + public static RestrictedSolrClient createSolrClient(String solrBaseURL, String collection) { + return new RestrictedSolrClient(new HttpSolrClient.Builder(solrBaseURL).build(), collection); + } + + /** + * Construct a restricting {@code SolrClient} where the {@link #fixedParams} are taken from + * {@link PropertiesLoader#SOLR_PARAMS_MAP}. + *

+ * Important: As the {@code defaultCollection} is explicit, the given {@code inner} {@code SolrClient} + * must not specify the collection name in its URL. A valid URL would be {@code http://localhost:8983/solr}. + * @param inner a SolrClient set to a Solr instance without a collection name specified. + * @param defaultCollection the collection to query if no explicit collection is given in the + * {@code SolrClient} calls. + */ + public RestrictedSolrClient(SolrClient inner, String defaultCollection) { + this(inner, defaultCollection, PropertiesLoader.SOLR_PARAMS_MAP); + } + + /** + * Construct a restricting {@code SolrClient}. + *
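The three-argument constructor documented below takes the fixed parameters directly, which makes the restriction easy to see in isolation. In this hedged sketch the fq value is hypothetical and merely stands in for PropertiesLoader.SOLR_PARAMS_MAP:

    import java.util.Collections;
    import java.util.Map;
    import org.apache.solr.client.solrj.SolrQuery;
    import org.apache.solr.client.solrj.impl.HttpSolrClient;
    import org.apache.solr.client.solrj.response.QueryResponse;

    public class RestrictDemo {
        public static void main(String[] args) throws Exception {
            // Hypothetical fixed parameter, standing in for PropertiesLoader.SOLR_PARAMS_MAP.
            Map<String, String> fixed = Collections.singletonMap("fq", "status_code:200");
            try (RestrictedSolrClient solr = new RestrictedSolrClient(
                    new HttpSolrClient.Builder("http://localhost:8983/solr").build(),
                    "netarchivebuilder", fixed)) {
                // The caller issues a plain query; restrict(SolrParams) injects
                // fq=status_code:200 before the request reaches the inner client.
                QueryResponse rsp = solr.query(new SolrQuery("*:*"));
                System.out.println(rsp.getResults().getNumFound());
            }
        }
    }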

+ * Important: As the {@code defaultCollection} is explicit, the given {@code inner} {@code SolrClient} + * must not specify the collection name in its URL. A valid URL would be {@code http://localhost:8983/solr}. + * @param inner a SolrClient set to a Solr instance without a collection name specified. + * @param defaultCollection the collection to query is no explicit collection is given in the + * {@code SolrClient} calls. + * @param fixedParams the fixed parameters to apply to all calls to the {@code SolrClient}. + */ + public RestrictedSolrClient(SolrClient inner, String defaultCollection, Map fixedParams) { + this.inner = inner; + this.defaultCollection = defaultCollection; + this.fixedParams = fixedParams; + } + + /** + * Applies the {@link #fixedParams} on the given {@code params}, with the adjustment that existing + * filter queries are extended instead of being replaced. + * @param params parameters intended for searching. + * @return restricted parameters, ready for search. + */ + private SolrParams restrict(SolrParams params) { + if (fixedParams == null || fixedParams.isEmpty()) { + return params; + } + ModifiableSolrParams restricted = new ModifiableSolrParams(params); + fixedParams.forEach((key, value) -> { + if (CommonParams.FL.equals(key)) { + restricted.add(key, value); + } else { + restricted.set(key, value); + } + }); + return restricted; + } + + /** + * Non-implemented {@link SolrRequest} version of {@link #restrict(SolrParams)}. + */ + private SolrRequest restrict(SolrRequest request) { + log.error("constrict(SolrRequest) called, but is not implemented yet, " + + "as it was alledgedly not used in SolrWayback"); + throw new UnsupportedOperationException("Restriction of SolrRequests not supported yet"); + } + + /* Delegates below where constrict and defaultCollection are applied when possible */ + + @Override + public UpdateResponse add(String collection, Collection docs) throws SolrServerException, IOException { + return inner.add(collection, docs); + } + + @Override + public UpdateResponse add(Collection docs) throws SolrServerException, IOException { + return inner.add(defaultCollection, docs); + } + + @Override + public UpdateResponse add(String collection, Collection docs, int commitWithinMs) throws SolrServerException, IOException { + return inner.add(collection, docs, commitWithinMs); + } + + @Override + public UpdateResponse add(Collection docs, int commitWithinMs) throws SolrServerException, IOException { + return inner.add(defaultCollection, docs, commitWithinMs); + } + + @Override + public UpdateResponse add(String collection, SolrInputDocument doc) throws SolrServerException, IOException { + return inner.add(collection, doc); + } + + @Override + public UpdateResponse add(SolrInputDocument doc) throws SolrServerException, IOException { + return inner.add(defaultCollection, doc); + } + + @Override + public UpdateResponse add(String collection, SolrInputDocument doc, int commitWithinMs) throws SolrServerException, IOException { + return inner.add(collection, doc, commitWithinMs); + } + + @Override + public UpdateResponse add(SolrInputDocument doc, int commitWithinMs) throws SolrServerException, IOException { + return inner.add(defaultCollection, doc, commitWithinMs); + } + + @Override + public UpdateResponse add(String collection, Iterator docIterator) throws SolrServerException, IOException { + return inner.add(collection, docIterator); + } + + @Override + public UpdateResponse add(Iterator docIterator) throws SolrServerException, IOException { + return 
inner.add(defaultCollection, docIterator); + } + + @Override + public UpdateResponse addBean(String collection, Object obj) throws IOException, SolrServerException { + return inner.addBean(collection, obj); + } + + @Override + public UpdateResponse addBean(Object obj) throws IOException, SolrServerException { + return inner.addBean(defaultCollection, obj); + } + + @Override + public UpdateResponse addBean(String collection, Object obj, int commitWithinMs) throws IOException, SolrServerException { + return inner.addBean(collection, obj, commitWithinMs); + } + + @Override + public UpdateResponse addBean(Object obj, int commitWithinMs) throws IOException, SolrServerException { + return inner.addBean(defaultCollection, obj, commitWithinMs); + } + + @Override + public UpdateResponse addBeans(String collection, Collection beans) throws SolrServerException, IOException { + return inner.addBeans(collection, beans); + } + + @Override + public UpdateResponse addBeans(Collection beans) throws SolrServerException, IOException { + return inner.addBeans(defaultCollection, beans); + } + + @Override + public UpdateResponse addBeans(String collection, Collection beans, int commitWithinMs) throws SolrServerException, IOException { + return inner.addBeans(collection, beans, commitWithinMs); + } + + @Override + public UpdateResponse addBeans(Collection beans, int commitWithinMs) throws SolrServerException, IOException { + return inner.addBeans(defaultCollection, beans, commitWithinMs); + } + + @Override + public UpdateResponse addBeans(String collection, Iterator beanIterator) throws SolrServerException, IOException { + return inner.addBeans(collection, beanIterator); + } + + @Override + public UpdateResponse addBeans(Iterator beanIterator) throws SolrServerException, IOException { + return inner.addBeans(defaultCollection, beanIterator); + } + + @Override + public UpdateResponse commit(String collection) throws SolrServerException, IOException { + return inner.commit(collection); + } + + @Override + public UpdateResponse commit() throws SolrServerException, IOException { + return inner.commit(defaultCollection); + } + + @Override + public UpdateResponse commit(String collection, boolean waitFlush, boolean waitSearcher) throws SolrServerException, IOException { + return inner.commit(collection, waitFlush, waitSearcher); + } + + @Override + public UpdateResponse commit(boolean waitFlush, boolean waitSearcher) throws SolrServerException, IOException { + return inner.commit(defaultCollection, waitFlush, waitSearcher); + } + + @Override + public UpdateResponse commit(String collection, boolean waitFlush, boolean waitSearcher, boolean softCommit) throws SolrServerException, IOException { + return inner.commit(collection, waitFlush, waitSearcher, softCommit); + } + + @Override + public UpdateResponse commit(boolean waitFlush, boolean waitSearcher, boolean softCommit) throws SolrServerException, IOException { + return inner.commit(defaultCollection, waitFlush, waitSearcher, softCommit); + } + + @Override + public UpdateResponse optimize(String collection) throws SolrServerException, IOException { + return inner.optimize(collection); + } + + @Override + public UpdateResponse optimize() throws SolrServerException, IOException { + return inner.optimize(defaultCollection); + } + + @Override + public UpdateResponse optimize(String collection, boolean waitFlush, boolean waitSearcher) throws SolrServerException, IOException { + return inner.optimize(collection, waitFlush, waitSearcher); + } + + @Override + public 
UpdateResponse optimize(boolean waitFlush, boolean waitSearcher) throws SolrServerException, IOException { + return inner.optimize(defaultCollection, waitFlush, waitSearcher); + } + + @Override + public UpdateResponse optimize(String collection, boolean waitFlush, boolean waitSearcher, int maxSegments) throws SolrServerException, IOException { + return inner.optimize(collection, waitFlush, waitSearcher, maxSegments); + } + + @Override + public UpdateResponse optimize(boolean waitFlush, boolean waitSearcher, int maxSegments) throws SolrServerException, IOException { + return inner.optimize(defaultCollection, waitFlush, waitSearcher, maxSegments); + } + + @Override + public UpdateResponse rollback(String collection) throws SolrServerException, IOException { + return inner.rollback(collection); + } + + @Override + public UpdateResponse rollback() throws SolrServerException, IOException { + return inner.rollback(defaultCollection); + } + + @Override + public UpdateResponse deleteById(String collection, String id) throws SolrServerException, IOException { + return inner.deleteById(collection, id); + } + + @Override + public UpdateResponse deleteById(String id) throws SolrServerException, IOException { + return inner.deleteById(defaultCollection, id); + } + + @Override + public UpdateResponse deleteById(String collection, String id, int commitWithinMs) throws SolrServerException, IOException { + return inner.deleteById(collection, id, commitWithinMs); + } + + @Override + public UpdateResponse deleteById(String id, int commitWithinMs) throws SolrServerException, IOException { + return inner.deleteById(defaultCollection, id, commitWithinMs); + } + + @Override + public UpdateResponse deleteById(String collection, List ids) throws SolrServerException, IOException { + return inner.deleteById(collection, ids); + } + + @Override + public UpdateResponse deleteById(List ids) throws SolrServerException, IOException { + return inner.deleteById(defaultCollection, ids); + } + + @Override + public UpdateResponse deleteById(String collection, List ids, int commitWithinMs) throws SolrServerException, IOException { + return inner.deleteById(collection, ids, commitWithinMs); + } + + @Override + public UpdateResponse deleteById(List ids, int commitWithinMs) throws SolrServerException, IOException { + return inner.deleteById(defaultCollection, ids, commitWithinMs); + } + + @Override + public UpdateResponse deleteByQuery(String collection, String query) throws SolrServerException, IOException { + return inner.deleteByQuery(collection, query); + } + + @Override + public UpdateResponse deleteByQuery(String query) throws SolrServerException, IOException { + return inner.deleteByQuery(defaultCollection, query); + } + + @Override + public UpdateResponse deleteByQuery(String collection, String query, int commitWithinMs) throws SolrServerException, IOException { + return inner.deleteByQuery(collection, query, commitWithinMs); + } + + @Override + public UpdateResponse deleteByQuery(String query, int commitWithinMs) throws SolrServerException, IOException { + return inner.deleteByQuery(defaultCollection, query, commitWithinMs); + } + + @Override + public SolrPingResponse ping() throws SolrServerException, IOException { + return inner.ping(); + } + + @Override + public QueryResponse query(String collection, SolrParams params) throws SolrServerException, IOException { + return inner.query(collection, restrict(params)); + } + + @Override + public QueryResponse query(SolrParams params) throws SolrServerException, IOException { + 
return inner.query(defaultCollection, restrict(params)); + } + + @Override + public QueryResponse query(String collection, SolrParams params, SolrRequest.METHOD method) throws SolrServerException, IOException { + return inner.query(collection, params, method); + } + + @Override + public QueryResponse query(SolrParams params, SolrRequest.METHOD method) throws SolrServerException, IOException { + return inner.query(defaultCollection, restrict(params), method); + } + + @Override + public QueryResponse queryAndStreamResponse(String collection, SolrParams params, StreamingResponseCallback callback) throws SolrServerException, IOException { + return inner.queryAndStreamResponse(collection, restrict(params), callback); + } + + @Override + public QueryResponse queryAndStreamResponse(SolrParams params, StreamingResponseCallback callback) throws SolrServerException, IOException { + return inner.queryAndStreamResponse(defaultCollection, restrict(params), callback); + } + + @Override + public SolrDocument getById(String collection, String id) throws SolrServerException, IOException { + return inner.getById(collection, id); + } + + @Override + public SolrDocument getById(String id) throws SolrServerException, IOException { + return inner.getById(defaultCollection, id); + } + + @Override + public SolrDocument getById(String collection, String id, SolrParams params) throws SolrServerException, IOException { + return inner.getById(collection, id, restrict(params)); + } + + @Override + public SolrDocument getById(String id, SolrParams params) throws SolrServerException, IOException { + return inner.getById(defaultCollection, id, restrict(params)); + } + + @Override + public SolrDocumentList getById(String collection, Collection ids) throws SolrServerException, IOException { + // TODO: Should the default parameters be used with ID lookups? 
+ return inner.getById(collection, ids); + } + + @Override + public SolrDocumentList getById(Collection ids) throws SolrServerException, IOException { + return inner.getById(defaultCollection, ids); + } + + @Override + public SolrDocumentList getById(String collection, Collection ids, SolrParams params) throws SolrServerException, IOException { + return inner.getById(collection, ids, restrict(params)); + } + + @Override + public SolrDocumentList getById(Collection ids, SolrParams params) throws SolrServerException, IOException { + return inner.getById(defaultCollection, ids, restrict(params)); + } + + @Override + public NamedList request(SolrRequest request, String collection) throws SolrServerException, IOException { + return inner.request(restrict(request), collection); + } + + @Override + public DocumentObjectBinder getBinder() { + return inner.getBinder(); + } + + @Override + public void close() throws IOException { + inner.close(); + } +} From ac603551fcdbbc0d5008730f8c1d2ec0a8a1c7d6 Mon Sep 17 00:00:00 2001 From: Toke Eskildsen Date: Sat, 14 Oct 2023 21:34:59 +0200 Subject: [PATCH 02/10] Simplify creation of RestrictedSolrClient --- .../solr/NetarchiveSolrClient.java | 8 +++---- .../solr/RestrictedSolrClient.java | 24 +++++++++++++++++++ 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/NetarchiveSolrClient.java b/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/NetarchiveSolrClient.java index eea9054c..fa9c827f 100644 --- a/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/NetarchiveSolrClient.java +++ b/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/NetarchiveSolrClient.java @@ -74,7 +74,7 @@ protected NetarchiveSolrClient() { // private. Singleton * */ public static void initialize(String solrServerUrl) { - SolrClient innerSolrClient = new HttpSolrClient.Builder(solrServerUrl).build(); + SolrClient innerSolrClient = RestrictedSolrClient.createSolrClient(); if (PropertiesLoader.SOLR_SERVER_CACHING) { int maxCachingEntries = PropertiesLoader.SOLR_SERVER_CACHING_MAX_ENTRIES; @@ -82,12 +82,12 @@ public static void initialize(String solrServerUrl) { solrServer = new CachingSolrClient(innerSolrClient, maxCachingEntries, maxCachingSeconds, -1); //-1 means no maximum number of connections log.info("SolrClient initialized with caching properties: maxCachedEntrie="+maxCachingEntries +" cacheAgeSeconds="+maxCachingSeconds); } else { - solrServer = new HttpSolrClient.Builder(solrServerUrl).build(); - log.info("SolClient initialized without caching"); + solrServer = innerSolrClient; + log.info("SolrClient initialized without caching"); } // some of the solr query will never using cache. word cloud(cache memory) + playback resolving etc. (cache poisoning) - noCacheSolrServer = new HttpSolrClient.Builder(solrServerUrl).build(); + noCacheSolrServer = innerSolrClient; // solrServer.setRequestWriter(new BinaryRequestWriter()); // To avoid http // error code 413/414, due to monster URI. 
(and it is faster) diff --git a/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/RestrictedSolrClient.java b/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/RestrictedSolrClient.java index 6ef0bd91..61dc4470 100644 --- a/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/RestrictedSolrClient.java +++ b/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/RestrictedSolrClient.java @@ -38,7 +38,10 @@ import java.util.Collection; import java.util.Iterator; import java.util.List; +import java.util.Locale; import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; /** * SolrClient wrapper that ensures that {@link PropertiesLoader#SOLR_PARAMS_MAP} @@ -67,6 +70,26 @@ public class RestrictedSolrClient extends SolrClient { */ private final Map fixedParams; + /** + * Create a {@link HttpSolrClient} wrapped as a {@code RestrictedSolrClient} using the property + * {@link PropertiesLoader#SOLR_PARAMS_MAP} for restrictions and the given {@code collection} as + * {@link #defaultCollection}. + *
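The parameterless factory added below derives both the base URL and the collection from the combined PropertiesLoader.SOLR_SERVER value. As a quick illustration of the split performed by the SOLR_COLLECTION_PATTERN regex introduced further down, with a hypothetical sample URL:

    import java.util.regex.Matcher;
    import java.util.regex.Pattern;

    public class SolrUrlSplitDemo {
        // Same pattern as introduced in this patch: base URL in group 1, collection in group 2.
        private static final Pattern SOLR_COLLECTION_PATTERN = Pattern.compile("(http.*)/([^/]+)/?$");

        public static void main(String[] args) {
            Matcher m = SOLR_COLLECTION_PATTERN.matcher("http://localhost:8983/solr/netarchivebuilder");
            if (m.matches()) {
                System.out.println(m.group(1)); // http://localhost:8983/solr
                System.out.println(m.group(2)); // netarchivebuilder
            }
        }
    }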

+ * The {@code solrBaseURL} and {@code collection} are parsed from the combined version + * {@link PropertiesLoader#SOLR_SERVER}. + * @return a {@code SolrClient} where all calls are restricted aka "safe". + */ + public static RestrictedSolrClient createSolrClient() { + Matcher m = SOLR_COLLECTION_PATTERN.matcher(PropertiesLoader.SOLR_SERVER); + if (!m.matches()) { + throw new IllegalStateException(String.format( + Locale.ROOT, "Unable to match Solr and collection from '%s' using pattern '%s'", + PropertiesLoader.SOLR_SERVER, SOLR_COLLECTION_PATTERN.pattern())); + } + return createSolrClient(m.group(1), m.group(2)); + } + private static final Pattern SOLR_COLLECTION_PATTERN = Pattern.compile("(http.*)/([^/]+)/?$"); + /** * Create a {@link HttpSolrClient} wrapped as a {@code RestrictedSolrClient} using the property * {@link PropertiesLoader#SOLR_PARAMS_MAP} for restrictions and the given {@code collection} as @@ -76,6 +99,7 @@ public class RestrictedSolrClient extends SolrClient { * @return a {@code SolrClient} where all calls are restricted aka "safe". */ public static RestrictedSolrClient createSolrClient(String solrBaseURL, String collection) { + log.info("Creating RestrictedSolrClient(solrBaseURL='{}', collection='{}')", solrBaseURL, collection); return new RestrictedSolrClient(new HttpSolrClient.Builder(solrBaseURL).build(), collection); } From 7e9c71c08746db522e69822074feb4fb14826dfa Mon Sep 17 00:00:00 2001 From: Toke Eskildsen Date: Sun, 15 Oct 2023 13:13:54 +0200 Subject: [PATCH 03/10] Minor: deepCopy of SolrQuery relied on assumption of fresh Entry creation for iteration --- .../solrwayback/solr/RestrictedSolrClient.java | 7 ++++--- .../solrwayback/solr/SolrStreamingWarcExportClient.java | 4 +--- .../dk/kb/netarchivesuite/solrwayback/util/SolrUtils.java | 7 ++++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/RestrictedSolrClient.java b/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/RestrictedSolrClient.java index 61dc4470..b9febfee 100644 --- a/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/RestrictedSolrClient.java +++ b/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/RestrictedSolrClient.java @@ -52,7 +52,7 @@ * Important: As the {@link #defaultCollection} is explicit, the given {@code inner} {@code SolrClient}s * must not specify the collection name in its URL. A valid URL would be {@code http://localhost:8983/solr}. *

- * Recommended: + * Recommended: Use the convenience method {@link #createSolrClient()} for all Solr client creation. */ public class RestrictedSolrClient extends SolrClient { private static final Logger log = LoggerFactory.getLogger(RestrictedSolrClient.class); @@ -157,13 +157,14 @@ private SolrParams restrict(SolrParams params) { /** * Non-implemented {@link SolrRequest} version of {@link #restrict(SolrParams)}. */ + @SuppressWarnings("rawtypes") private SolrRequest restrict(SolrRequest request) { - log.error("constrict(SolrRequest) called, but is not implemented yet, " + + log.error("restrict(SolrRequest) called, but is not implemented yet, " + "as it was alledgedly not used in SolrWayback"); throw new UnsupportedOperationException("Restriction of SolrRequests not supported yet"); } - /* Delegates below where constrict and defaultCollection are applied when possible */ + /* Delegates below where restrict(...) and defaultCollection are applied when possible */ @Override public UpdateResponse add(String collection, Collection docs) throws SolrServerException, IOException { diff --git a/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/SolrStreamingWarcExportClient.java b/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/SolrStreamingWarcExportClient.java index cbc7b4c7..3d064cbb 100644 --- a/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/SolrStreamingWarcExportClient.java +++ b/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/SolrStreamingWarcExportClient.java @@ -3,8 +3,6 @@ import org.apache.solr.client.solrj.SolrClient; import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.client.solrj.SolrRequest.METHOD; -import org.apache.solr.client.solrj.impl.BinaryRequestWriter; -import org.apache.solr.client.solrj.impl.HttpSolrClient; import org.apache.solr.client.solrj.response.QueryResponse; import org.apache.solr.common.SolrDocumentList; import org.apache.solr.common.params.CursorMarkParams; @@ -23,7 +21,7 @@ public class SolrStreamingWarcExportClient { public SolrStreamingWarcExportClient(String solrServerUrl){ - solrServer = new HttpSolrClient.Builder(solrServerUrl).build(); + solrServer = RestrictedSolrClient.createSolrClient(); //solrServer.setRequestWriter(new BinaryRequestWriter()); cursorMark = CursorMarkParams.CURSOR_MARK_START; //Reset to start again } diff --git a/src/main/java/dk/kb/netarchivesuite/solrwayback/util/SolrUtils.java b/src/main/java/dk/kb/netarchivesuite/solrwayback/util/SolrUtils.java index f8a357a2..93741568 100644 --- a/src/main/java/dk/kb/netarchivesuite/solrwayback/util/SolrUtils.java +++ b/src/main/java/dk/kb/netarchivesuite/solrwayback/util/SolrUtils.java @@ -6,6 +6,7 @@ import dk.kb.netarchivesuite.solrwayback.service.dto.IndexDoc; import dk.kb.netarchivesuite.solrwayback.service.dto.IndexDocShort; import dk.kb.netarchivesuite.solrwayback.solr.NetarchiveSolrClient; +import org.apache.commons.lang3.tuple.Pair; import org.apache.solr.client.solrj.SolrQuery; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrDocumentList; @@ -310,9 +311,9 @@ public static ArcEntryDescriptor indexDoc2ArcEntryDescriptor(IndexDoc indexDoc) */ public static SolrQuery deepCopy(SolrQuery solrQuery) { SolrQuery qc = new SolrQuery(); - solrQuery.getMap().entrySet().stream(). - peek(entry -> entry.setValue(Arrays.copyOf(entry.getValue(), entry.getValue().length))). 
- forEach(entry -> qc.set(entry.getKey(), entry.getValue())); + solrQuery.getMap().entrySet().stream() + .map(entry -> Pair.of(entry.getKey(), Arrays.copyOf(entry.getValue(), entry.getValue().length))) + .forEach(entry -> qc.set(entry.getKey(), entry.getValue())); return qc; } From 60317cff18ed1b8bdc5e645bb04b01cd2a778990 Mon Sep 17 00:00:00 2001 From: Toke Eskildsen Date: Sun, 15 Oct 2023 14:22:52 +0200 Subject: [PATCH 04/10] Remove SolrUtils.setSolrParams as the functionality is now covered by RestrictedSolrClient --- .../solr/NetarchiveSolrClient.java | 92 ++++++------------- .../solr/SolrGenericStreaming.java | 3 +- .../solrwayback/util/SolrUtils.java | 14 --- 3 files changed, 30 insertions(+), 79 deletions(-) diff --git a/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/NetarchiveSolrClient.java b/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/NetarchiveSolrClient.java index fa9c827f..22289852 100644 --- a/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/NetarchiveSolrClient.java +++ b/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/NetarchiveSolrClient.java @@ -169,7 +169,6 @@ public List getDomainFacetsIngoing(String domain, int facetLimit, Da solrQuery.addFilterQuery("crawl_date:[" + dateStart + " TO " + dateEnd + "]"); solrQuery.add("fl","id"); - SolrUtils.setSolrParams(solrQuery); QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); List facetList = new ArrayList(); FacetField facet = rsp.getFacetField("domain"); @@ -199,8 +198,7 @@ public List getDomainFacetsOutgoing(String domain, int facetLimit, D solrQuery.add("facet.limit", "" + (facetLimit + 1)); // +1 because itself will be removed and is almost certain of resultset is self-linking solrQuery.addFilterQuery("crawl_date:[" + dateStart + " TO " + dateEnd + "]"); solrQuery.add("fl","id"); // request - SolrUtils.setSolrParams(solrQuery); - QueryResponse rsp = noCacheSolrServer.query(solrQuery, METHOD.POST); //do not cache + QueryResponse rsp = noCacheSolrServer.query(solrQuery, METHOD.POST); //do not cache List facetList = new ArrayList(); FacetField facet = rsp.getFacetField("links_domains"); @@ -285,8 +283,7 @@ public WaybackStatistics getWayBackStatistics(int statusCode, String url, String solrQuery.setGetFieldStatistics(statsField); long call1ns = -System.nanoTime(); - SolrUtils.setSolrParams(solrQuery); - QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); + QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); call1ns += System.nanoTime(); final long call1nsSolr = rsp.getQTime(); @@ -310,8 +307,7 @@ public WaybackStatistics getWayBackStatistics(int statusCode, String url, String solrQuery.setGetFieldStatistics(statsField); long call2ns = -System.nanoTime(); - SolrUtils.setSolrParams(solrQuery); - rsp = solrServer.query(solrQuery, METHOD.POST); + rsp = solrServer.query(solrQuery, METHOD.POST); call2ns += System.nanoTime(); final long call2nsSolr = rsp.getQTime(); @@ -341,8 +337,7 @@ public WaybackStatistics getWayBackStatistics(int statusCode, String url, String solrQuery.setGetFieldStatistics(statsField); callDomain = -System.nanoTime(); - SolrUtils.setSolrParams(solrQuery); - rsp = solrServer.query(solrQuery, METHOD.POST); + rsp = solrServer.query(solrQuery, METHOD.POST); callDomain += System.nanoTime(); callDomainSolr = rsp.getQTime(); if (rsp.getResults().size() == 0) { @@ -357,8 +352,7 @@ public WaybackStatistics getWayBackStatistics(int statusCode, String url, String solrQuery.setGetFieldStatistics("content_length"); long call3ns = -System.nanoTime(); - 
SolrUtils.setSolrParams(solrQuery); - rsp = solrServer.query(solrQuery, METHOD.POST); + rsp = solrServer.query(solrQuery, METHOD.POST); call3ns += System.nanoTime(); final long call3nsSolr = rsp.getQTime(); @@ -594,8 +588,7 @@ public SolrDocument resolveURLLenient(List fields, String url, String... solrQuery.set(HighlightParams.HIGHLIGHT, false); solrQuery.set(FacetParams.FACET, false); solrQuery.set(GroupParams.GROUP, false); - SolrUtils.setSolrParams(solrQuery); - QueryResponse response; + QueryResponse response; try { lenientAttempts.incrementAndGet(); response = noCacheSolrServer.query(solrQuery); @@ -617,8 +610,7 @@ public SolrDocument resolveURLLenient(List fields, String url, String... public ArcEntryDescriptor findVideo(String videoQueryString) throws Exception { SolrQuery solrQuery = new SolrQuery(); solrQuery.setQuery(videoQueryString); - SolrUtils.setSolrParams(solrQuery); - solrQuery.setRows(1); // Just get one result + solrQuery.setRows(1); // Just get one result solrQuery.set("facet", "false"); // Very important. Must overwrite to false. Facets are very slow and expensive. solrQuery.add("fq", "content_type_norm:video"); // only videos @@ -652,8 +644,7 @@ public ArrayList getHarvestTimesForUrl(String url) throws Exception { solrQuery.set("facet", "false"); // very important. Must overwrite to false. Facets are very slow and expensive. solrQuery.add("fl", "id,crawl_date"); solrQuery.setRows(1000000); - SolrUtils.setSolrParams(solrQuery); - QueryResponse rsp = loggedSolrQuery("getHarvestTimeForUrl", solrQuery); + QueryResponse rsp = loggedSolrQuery("getHarvestTimeForUrl", solrQuery); SolrDocumentList docs = rsp.getResults(); @@ -674,8 +665,7 @@ public long countResults(String query, String... filterQueries) throws SolrServe solrQuery.add("fl", "id"); solrQuery.setFilterQueries(filterQueries); solrQuery.setRows(0); - SolrUtils.setSolrParams(solrQuery); - QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); + QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); return rsp.getResults().getNumFound(); } @@ -692,8 +682,7 @@ public String getConcatedTextFromHtmlForQuery(String query,String filterQuery) t solrQuery.setRows(5000); long solrNS = -System.nanoTime(); - SolrUtils.setSolrParams(solrQuery); - QueryResponse rsp = noCacheSolrServer.query(solrQuery, METHOD.POST); //do not cache + QueryResponse rsp = noCacheSolrServer.query(solrQuery, METHOD.POST); //do not cache solrNS += System.nanoTime(); SolrDocumentList docs = rsp.getResults(); @@ -722,8 +711,7 @@ public ArrayList getHarvestPreviewsForUrl(int year,String url) throws solrQuery.setRows(1000000); QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); - SolrUtils.setSolrParams(solrQuery); - SolrDocumentList docs = rsp.getResults(); + SolrDocumentList docs = rsp.getResults(); ArrayList indexDocs = SolrUtils.solrDocList2IndexDoc(docs); return indexDocs; @@ -742,8 +730,7 @@ public ArrayList getPagePreviewsYearInfo(String url) throws Exceptio solrQuery.add("facet.limit", "100"); //All years... 
solrQuery.add("fl","id"); solrQuery.setRows(0); - SolrUtils.setSolrParams(solrQuery); - QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); + QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); ArrayList facetList = new ArrayList(); FacetField facet = rsp.getFacetField("crawl_year"); for (Count c : facet.getValues()) { @@ -776,8 +763,7 @@ public IndexDoc getArcEntry(String source_file_path, long offset) throws Excepti solrQuery.setRows(1); // QueryResponse rsp = loggedSolrQuery("getArchEntry", solrQuery); //Timing disabled due to spam. Also only took 1-5 millis - SolrUtils.setSolrParams(solrQuery); - QueryResponse rsp = noCacheSolrServer.query(solrQuery, METHOD.POST); + QueryResponse rsp = noCacheSolrServer.query(solrQuery, METHOD.POST); SolrDocumentList docs = rsp.getResults(); if (docs.getNumFound() == 0) { @@ -833,8 +819,6 @@ public ArrayList imagesLocationSearchWithSort(String searchText, Strin solrQuery.setRows(results); - SolrUtils.setSolrParams(solrQuery); //NOT SURE ABOUT THIS ONE! - // The 3 lines defines geospatial search. The ( ) are required if you want to // AND with another query solrQuery.setQuery("({!geofilt sfield=exif_location}) AND " + searchText); @@ -866,8 +850,7 @@ public SearchResult search(String searchString, String filterQuery, int results) } - SolrUtils.setSolrParams(solrQuery); - QueryResponse rsp = loggedSolrQuery("search", solrQuery); + QueryResponse rsp = loggedSolrQuery("search", solrQuery); SolrDocumentList docs = rsp.getResults(); result.setNumberOfResults(docs.getNumFound()); @@ -879,8 +862,7 @@ public SearchResult search(String searchString, String filterQuery, int results) public long numberOfDocuments() throws Exception { SolrQuery solrQuery = new SolrQuery(); solrQuery.setQuery("*:*"); - SolrUtils.setSolrParams(solrQuery); - QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); + QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); SolrDocumentList docs = rsp.getResults(); return docs.getNumFound(); } @@ -1165,8 +1147,7 @@ public IndexDoc findClosestHarvestTimeForUrl(String url, String timeStamp) throw // other methods in this class, but not as critical there. // Hoping for a solr fix.... solrQuery.setRows(10); - SolrUtils.setSolrParams(solrQuery); - QueryResponse rsp = loggedSolrQuery( + QueryResponse rsp = loggedSolrQuery( String.format("findClosestHarvestTimeForUrl(url='%s', timestamp=%s)", url.length() > 50 ? url.substring(0, 50) + "..." 
: url, timeStamp), solrQuery); @@ -1251,8 +1232,7 @@ public SolrQuery buildSolrQueryForPeriod(String query, String startDate, String public Long countTextHtmlForPeriod(String query, String startDate, String endDate) throws Exception { SolrQuery solrQuery = buildSolrQueryForPeriod(query, startDate, endDate); solrQuery.add("fq", SolrUtils.NO_REVISIT_FILTER); // do not include record_type:revisit - SolrUtils.setSolrParams(solrQuery); - QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); + QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); return rsp.getResults().getNumFound(); } @@ -1261,8 +1241,7 @@ public Long countTagHtmlForPeriod(String query, String startDate, String endDate throw new InvalidArgumentServiceException("Tag syntax not accepted:" + query); } SolrQuery solrQuery = buildSolrQueryForPeriod("elements_used:\"" + query + "\"", startDate, endDate); - SolrUtils.setSolrParams(solrQuery); - QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); + QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); return rsp.getResults().getNumFound(); } @@ -1282,8 +1261,7 @@ public HashMap getYearHtmlFacets(String query) throws Exception { solrQuery.set("facet.field", "crawl_year"); solrQuery.set("facet.sort", "index"); solrQuery.set("facet.limit", "500"); // 500 is higher than number of different years - SolrUtils.setSolrParams(solrQuery); - QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); + QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); FacetField facetField = rsp.getFacetField("crawl_year"); @@ -1304,8 +1282,7 @@ public IndexDoc findExactMatchPWID(String url, String utc) throws Exception { solrQuery.setRows(1); // 1 page only solrQuery.add("fl", SolrUtils.indexDocFieldList); - SolrUtils.setSolrParams(solrQuery); - QueryResponse rsp = loggedSolrQuery("pwidQuery", solrQuery); + QueryResponse rsp = loggedSolrQuery("pwidQuery", solrQuery); SolrDocumentList docs = rsp.getResults(); if (docs.size() == 0) { @@ -1331,8 +1308,7 @@ public HashMap getYearFacetsHtmlAll() throws Exception { solrQuery.add("fq","content_type_norm:html"); // only html pages solrQuery.add("fq", SolrUtils.NO_REVISIT_FILTER); // do not include record_type:revisit - SolrUtils.setSolrParams(solrQuery); - QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); + QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); FacetField facetField = rsp.getFacetField("crawl_year"); @@ -1364,8 +1340,7 @@ public HashMap getYearTextHtmlFacets(String query) throws Excepti solrQuery.add("fq","content_type_norm:html"); // only html pages solrQuery.add("fq", SolrUtils.NO_REVISIT_FILTER); // do not include record_type:revisit - SolrUtils.setSolrParams(solrQuery); - QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); + QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); FacetField facetField = rsp.getFacetField("crawl_year"); @@ -1388,8 +1363,7 @@ public ArrayList findNearestForResourceNameAndDomain(String domain, St solrQuery.set("group.sort", "abs(sub(ms(" + timeStamp + "), crawl_date)) asc"); solrQuery.add("fl", SolrUtils.indexDocFieldList); solrQuery.setFilterQueries(SolrUtils.NO_REVISIT_FILTER); // No binary for revists. 
- SolrUtils.setSolrParams(solrQuery); - QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); + QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); SolrDocumentList docs = groupsToDoc(rsp); return SolrUtils.solrDocList2IndexDoc(docs); } @@ -1426,8 +1400,7 @@ public String searchJsonResponseOnlyFacets(String query, List fq, boolea } } - SolrUtils.setSolrParams(solrQuery); - + NoOpResponseParser rawJsonResponseParser = new NoOpResponseParser(); rawJsonResponseParser.setWriterType("json"); @@ -1473,8 +1446,7 @@ public String searchJsonResponseOnlyFacetsLoadMore( String query, List f } - SolrUtils.setSolrParams(solrQuery); - + NoOpResponseParser rawJsonResponseParser = new NoOpResponseParser(); rawJsonResponseParser.setWriterType("json"); @@ -1530,8 +1502,7 @@ public String searchJsonResponseNoFacets(String query, List fq, boolean } } - SolrUtils.setSolrParams(solrQuery); - + NoOpResponseParser rawJsonResponseParser = new NoOpResponseParser(); rawJsonResponseParser.setWriterType("json"); @@ -1565,8 +1536,7 @@ public String idLookupResponse(String id, String fieldList) throws Exception { QueryRequest req = new QueryRequest(solrQuery); req.setResponseParser(rawJsonResponseParser); - SolrUtils.setSolrParams(solrQuery); - NamedList resp = solrServer.request(req); + NamedList resp = solrServer.request(req); String jsonResponse = (String) resp.get("response"); return jsonResponse; } @@ -1599,8 +1569,7 @@ public DomainStatistics domainStatistics(String domain, String startDate, String solrQuery.add("stats", "true"); solrQuery.add("stats.field", "{!count=true cardinality=true}url_norm"); // Important, use cardinality and not unique. solrQuery.add("stats.field", "{!sum=true}content_length"); - SolrUtils.setSolrParams(solrQuery); - QueryResponse rsp = solrServer.query(solrQuery); + QueryResponse rsp = solrServer.query(solrQuery); Map statsMap = rsp.getFieldStatsInfo(); FieldStatsInfo statsUrl_norm = statsMap.get("url_norm"); @@ -1653,7 +1622,6 @@ public String domainStatisticsForQuery(String query, List fq) throws Exc for (String filter : fq) { solrQuery.addFilterQuery(filter); } - SolrUtils.setSolrParams(solrQuery); //TODO not sure about this one NoOpResponseParser rawJsonResponseParser = new NoOpResponseParser(); rawJsonResponseParser.setWriterType("json"); @@ -1690,7 +1658,6 @@ public String domainStatisticsForQuery(String query, List fq, String sta for (String filter : fq) { solrQuery.addFilterQuery(filter); } - SolrUtils.setSolrParams(solrQuery); //TODO not sure about this one NoOpResponseParser rawJsonResponseParser = new NoOpResponseParser(); rawJsonResponseParser.setWriterType("json"); @@ -1771,8 +1738,7 @@ private QueryResponse loggedSolrQuery(String caller, SolrQuery solrQuery) throws * @return the result of issuing the query. */ public static QueryResponse query(SolrQuery solrQuery, boolean useCachingClient) { - SolrUtils.setSolrParams(solrQuery); - try { + try { return useCachingClient ? 
solrServer.query(solrQuery, METHOD.POST) : noCacheSolrServer.query(solrQuery, METHOD.POST); diff --git a/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/SolrGenericStreaming.java b/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/SolrGenericStreaming.java index 12548aee..7253f0f6 100644 --- a/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/SolrGenericStreaming.java +++ b/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/SolrGenericStreaming.java @@ -241,8 +241,7 @@ public static void adjustSolrQuery(SolrQuery solrQuery, } // Properties defined parameters - SolrUtils.setSolrParams(solrQuery); - + // Set default values if not already set solrQuery.set(CommonParams.FL, solrQuery.get(CommonParams.FL, "source_file_path,source_file_offset")); diff --git a/src/main/java/dk/kb/netarchivesuite/solrwayback/util/SolrUtils.java b/src/main/java/dk/kb/netarchivesuite/solrwayback/util/SolrUtils.java index 93741568..ba7b3543 100644 --- a/src/main/java/dk/kb/netarchivesuite/solrwayback/util/SolrUtils.java +++ b/src/main/java/dk/kb/netarchivesuite/solrwayback/util/SolrUtils.java @@ -317,20 +317,6 @@ public static SolrQuery deepCopy(SolrQuery solrQuery) { return qc; } - /** - * Sets properties-defined parameters. - * This should be called with ALL SolrQuery instances before issuing the query. - * - * The semantics of whether it should be called before or after setting method specific parameters is unclear. - * @param solrQuery a Solr query - */ - public static void setSolrParams(SolrQuery solrQuery) { - HashMap SOLR_PARAMS_MAP = PropertiesLoader.SOLR_PARAMS_MAP; - for (String key : SOLR_PARAMS_MAP.keySet()) { - solrQuery.set(key,SOLR_PARAMS_MAP.get(key)); - } - } - /** * Quotes the given phrase and escapes characters that needs escaping (backslash and quote). * {@code foo \bar "zoo} becomes {@code "foo \\bar \"zoo"}. From 785e58033f08db815d336b9264d12d788b080ab3 Mon Sep 17 00:00:00 2001 From: Toke Eskildsen Date: Mon, 16 Oct 2023 11:53:36 +0200 Subject: [PATCH 05/10] formatting: Fix indent and streaming expressions --- .../solr/NetarchiveSolrClient.java | 312 +++++++++--------- 1 file changed, 156 insertions(+), 156 deletions(-) diff --git a/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/NetarchiveSolrClient.java b/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/NetarchiveSolrClient.java index 22289852..847f218a 100644 --- a/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/NetarchiveSolrClient.java +++ b/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/NetarchiveSolrClient.java @@ -198,7 +198,7 @@ public List getDomainFacetsOutgoing(String domain, int facetLimit, D solrQuery.add("facet.limit", "" + (facetLimit + 1)); // +1 because itself will be removed and is almost certain of resultset is self-linking solrQuery.addFilterQuery("crawl_date:[" + dateStart + " TO " + dateEnd + "]"); solrQuery.add("fl","id"); // request - QueryResponse rsp = noCacheSolrServer.query(solrQuery, METHOD.POST); //do not cache + QueryResponse rsp = noCacheSolrServer.query(solrQuery, METHOD.POST); //do not cache List facetList = new ArrayList(); FacetField facet = rsp.getFacetField("links_domains"); @@ -251,7 +251,7 @@ public String getRawSolrQuery(String query,List fq,String fieldList, int return jsonResponse; } */ - + /* * The logic for getting the 4 dates in 2 queries is too complicated, and only * gives small performance boost... 
@@ -283,7 +283,7 @@ public WaybackStatistics getWayBackStatistics(int statusCode, String url, String solrQuery.setGetFieldStatistics(statsField); long call1ns = -System.nanoTime(); - QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); + QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); call1ns += System.nanoTime(); final long call1nsSolr = rsp.getQTime(); @@ -307,7 +307,7 @@ public WaybackStatistics getWayBackStatistics(int statusCode, String url, String solrQuery.setGetFieldStatistics(statsField); long call2ns = -System.nanoTime(); - rsp = solrServer.query(solrQuery, METHOD.POST); + rsp = solrServer.query(solrQuery, METHOD.POST); call2ns += System.nanoTime(); final long call2nsSolr = rsp.getQTime(); @@ -337,7 +337,7 @@ public WaybackStatistics getWayBackStatistics(int statusCode, String url, String solrQuery.setGetFieldStatistics(statsField); callDomain = -System.nanoTime(); - rsp = solrServer.query(solrQuery, METHOD.POST); + rsp = solrServer.query(solrQuery, METHOD.POST); callDomain += System.nanoTime(); callDomainSolr = rsp.getQTime(); if (rsp.getResults().size() == 0) { @@ -352,7 +352,7 @@ public WaybackStatistics getWayBackStatistics(int statusCode, String url, String solrQuery.setGetFieldStatistics("content_length"); long call3ns = -System.nanoTime(); - rsp = solrServer.query(solrQuery, METHOD.POST); + rsp = solrServer.query(solrQuery, METHOD.POST); call3ns += System.nanoTime(); final long call3nsSolr = rsp.getQTime(); @@ -384,18 +384,18 @@ public WaybackStatistics getWayBackStatistics(int statusCode, String url, String */ public ArrayList findImagesForTimestamp(String searchString, String timeStamp) { return SolrGenericStreaming.create( - SRequest.builder(). - query(searchString). - filterQueries("content_type_norm:image", // only images - SolrUtils.NO_REVISIT_FILTER, // No binary for revisits. - "image_size:[2000 TO *]"). // No small images. (fillers etc.) - fields(SolrUtils.indexDocFieldList). - timeProximityDeduplication(timeStamp, "url_norm"). - maxResults(50) // TODO: Make this an argument instead + SRequest.builder() + .query(searchString) + .filterQueries("content_type_norm:image", // only images + SolrUtils.NO_REVISIT_FILTER, // No binary for revisits. + "image_size:[2000 TO *]") // No small images. (fillers etc.) + .fields(SolrUtils.indexDocFieldList) + .timeProximityDeduplication(timeStamp, "url_norm") + .maxResults(50) // TODO: Make this an argument instead ). - stream(). - map(SolrUtils::solrDocument2ArcEntryDescriptor). - collect(Collectors.toCollection(ArrayList::new)); + stream() + .map(SolrUtils::solrDocument2ArcEntryDescriptor) + .collect(Collectors.toCollection(ArrayList::new)); } /** @@ -415,8 +415,8 @@ public ArrayList findImagesForTimestamp(String searchString, */ public Stream searchURLs(List fields, Stream urls, String... filterQueries) { // Handle processing in batches of 1000 for low latency and low memory overhead - return CollectionUtils.splitToStreams(urls, 1000). - flatMap(batch -> searchURLsSingleTake(fields, batch, filterQueries)); + return CollectionUtils.splitToStreams(urls, 1000) + .flatMap(batch -> searchURLsSingleTake(fields, batch, filterQueries)); } /** @@ -463,12 +463,12 @@ private Stream searchURLsSingleTake( Map lenient = resolveURLsLenient(allFields, unresolved, filterQueries); // Merge the results from direct and lenient and enrich with the SolrDocuments with originalURL - return urlPairs.stream(). - map(Pair::first). // originalURL - map(originalURL -> getValueFromMaps(originalURL, direct, lenient)). 
- filter(Objects::nonNull). - peek(resultPair -> resultPair.second().setField("originalURL", resultPair.first())). - map(Pair::second); + return urlPairs.stream() + .map(Pair::first) // originalURL + .map(originalURL -> getValueFromMaps(originalURL, direct, lenient)) + .filter(Objects::nonNull) + .peek(resultPair -> resultPair.second().setField("originalURL", resultPair.first())) + .map(Pair::second); } /** @@ -501,17 +501,17 @@ private Map resolveURLsDirect( } // Create list of url queries for the normURLs - Stream urlQueries = urlPairs.stream(). - map(Pair::second). - map(normURL -> "url_norm:" + SolrUtils.createPhrase(normURL)); + Stream urlQueries = urlPairs.stream() + .map(Pair::second) + .map(normURL -> "url_norm:" + SolrUtils.createPhrase(normURL)); // Resolve SolrDocuments using direct url_norm search and store them in a Map with url_norm as key - SRequest request = SRequest.builder(). - queries(urlQueries). - queryBatchSize(1000). // Same as partitionSize in splitToStreams - //usePaging(false). // Optimize Solr lookups (no longer needed) - fields(fields). - filterQueries(filterQueries); + SRequest request = SRequest.builder() + .queries(urlQueries) + .queryBatchSize(1000) // Same as partitionSize in splitToStreams + //.usePaging(false) // Optimize Solr lookups (no longer needed) + .fields(fields) + .filterQueries(filterQueries); if (idealTime != null) { request = request.timeProximityDeduplication(idealTime, "url_norm"); } else { @@ -520,13 +520,13 @@ private Map resolveURLsDirect( Map normResolved = request.stream(). collect(Collectors.toMap(value -> Objects.toString(value.getFieldValue("url_norm")), - value -> value)); + value -> value)); // Convert the Map of [url_norm, SolrDocument] to a Map of [originalURL, SolrDocument] - return urlPairs.stream(). - map(urlPair -> new Pair(urlPair.first(), normResolved.get(urlPair.second()))). - filter(urlPair -> Objects.nonNull(urlPair.second())). - collect(Collectors.toMap(Pair::first, Pair::second)); + return urlPairs.stream() + .map(urlPair -> new Pair<>(urlPair.first(), normResolved.get(urlPair.second()))) + .filter(urlPair -> Objects.nonNull(urlPair.second())) + .collect(Collectors.toMap(Pair::first, Pair::second)); } /** @@ -543,9 +543,9 @@ private Map resolveURLsLenient( throw new IllegalStateException("fields does not contain 'url_norm'"); } // Create jobs for unresolved originalURLs that delivers [originalURL, SolrDocument] - Stream>> lenientJobs = urlPairs. - map(Pair::first). // Only the originalURL is relevant when doing lenient resolving - map(originalURL -> () -> new Pair<>( + Stream>> lenientJobs = urlPairs + .map(Pair::first) // Only the originalURL is relevant when doing lenient resolving + .map(originalURL -> () -> new Pair<>( originalURL, resolveURLLenient(fields, originalURL, filterQueries))); @@ -555,20 +555,20 @@ private Map resolveURLsLenient( if (Objects.isNull(jobPair.second())) { log.debug("Unable to lenient resolve '{}'", jobPair.first()); } - }). - filter(jobPair -> Objects.nonNull(jobPair.second())). - peek(jobPair -> { + }) + .filter(jobPair -> Objects.nonNull(jobPair.second())) + .peek(jobPair -> { String originalURL = jobPair.first(); String normURL = Objects.toString(jobPair.second().getFieldValue("url_norm")); if (originalURL.equals(normURL)) { log.debug("Note: Lenient resolved '{}', but the url_norm was equal to the originalURL", - originalURL); + originalURL); } else { log.debug("Lenient resolved '{}' to '{}'", originalURL, normURL); } - } - ). 
- collect(Collectors.toMap(Pair::first, Pair::second)); + } + ) + .collect(Collectors.toMap(Pair::first, Pair::second)); } /** @@ -588,7 +588,7 @@ public SolrDocument resolveURLLenient(List fields, String url, String... solrQuery.set(HighlightParams.HIGHLIGHT, false); solrQuery.set(FacetParams.FACET, false); solrQuery.set(GroupParams.GROUP, false); - QueryResponse response; + QueryResponse response; try { lenientAttempts.incrementAndGet(); response = noCacheSolrServer.query(solrQuery); @@ -610,7 +610,7 @@ public SolrDocument resolveURLLenient(List fields, String url, String... public ArcEntryDescriptor findVideo(String videoQueryString) throws Exception { SolrQuery solrQuery = new SolrQuery(); solrQuery.setQuery(videoQueryString); - solrQuery.setRows(1); // Just get one result + solrQuery.setRows(1); // Just get one result solrQuery.set("facet", "false"); // Very important. Must overwrite to false. Facets are very slow and expensive. solrQuery.add("fq", "content_type_norm:video"); // only videos @@ -637,14 +637,14 @@ public SearchResult search(String searchString, String filterQuery) throws Excep public ArrayList getHarvestTimesForUrl(String url) throws Exception { ArrayList dates = new ArrayList(); - + String query=UrlUtils.fixLegacyNormaliseUrlErrorQuery(url); SolrQuery solrQuery = new SolrQuery(); solrQuery = new SolrQuery(query); solrQuery.set("facet", "false"); // very important. Must overwrite to false. Facets are very slow and expensive. solrQuery.add("fl", "id,crawl_date"); solrQuery.setRows(1000000); - QueryResponse rsp = loggedSolrQuery("getHarvestTimeForUrl", solrQuery); + QueryResponse rsp = loggedSolrQuery("getHarvestTimeForUrl", solrQuery); SolrDocumentList docs = rsp.getResults(); @@ -665,7 +665,7 @@ public long countResults(String query, String... filterQueries) throws SolrServe solrQuery.add("fl", "id"); solrQuery.setFilterQueries(filterQueries); solrQuery.setRows(0); - QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); + QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); return rsp.getResults().getNumFound(); } @@ -682,7 +682,7 @@ public String getConcatedTextFromHtmlForQuery(String query,String filterQuery) t solrQuery.setRows(5000); long solrNS = -System.nanoTime(); - QueryResponse rsp = noCacheSolrServer.query(solrQuery, METHOD.POST); //do not cache + QueryResponse rsp = noCacheSolrServer.query(solrQuery, METHOD.POST); //do not cache solrNS += System.nanoTime(); SolrDocumentList docs = rsp.getResults(); @@ -711,7 +711,7 @@ public ArrayList getHarvestPreviewsForUrl(int year,String url) throws solrQuery.setRows(1000000); QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); - SolrDocumentList docs = rsp.getResults(); + SolrDocumentList docs = rsp.getResults(); ArrayList indexDocs = SolrUtils.solrDocList2IndexDoc(docs); return indexDocs; @@ -730,7 +730,7 @@ public ArrayList getPagePreviewsYearInfo(String url) throws Exceptio solrQuery.add("facet.limit", "100"); //All years... solrQuery.add("fl","id"); solrQuery.setRows(0); - QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); + QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); ArrayList facetList = new ArrayList(); FacetField facet = rsp.getFacetField("crawl_year"); for (Count c : facet.getValues()) { @@ -763,7 +763,7 @@ public IndexDoc getArcEntry(String source_file_path, long offset) throws Excepti solrQuery.setRows(1); // QueryResponse rsp = loggedSolrQuery("getArchEntry", solrQuery); //Timing disabled due to spam. 
Also only took 1-5 millis - QueryResponse rsp = noCacheSolrServer.query(solrQuery, METHOD.POST); + QueryResponse rsp = noCacheSolrServer.query(solrQuery, METHOD.POST); SolrDocumentList docs = rsp.getResults(); if (docs.getNumFound() == 0) { @@ -817,8 +817,8 @@ public ArrayList imagesLocationSearchWithSort(String searchText, Strin solrQuery.add("sort", sort); } solrQuery.setRows(results); - - + + // The 3 lines defines geospatial search. The ( ) are required if you want to // AND with another query solrQuery.setQuery("({!geofilt sfield=exif_location}) AND " + searchText); @@ -848,9 +848,9 @@ public SearchResult search(String searchString, String filterQuery, int results) if (filterQuery != null) { solrQuery.setFilterQueries(filterQuery); } - - - QueryResponse rsp = loggedSolrQuery("search", solrQuery); + + + QueryResponse rsp = loggedSolrQuery("search", solrQuery); SolrDocumentList docs = rsp.getResults(); result.setNumberOfResults(docs.getNumFound()); @@ -862,21 +862,21 @@ public SearchResult search(String searchString, String filterQuery, int results) public long numberOfDocuments() throws Exception { SolrQuery solrQuery = new SolrQuery(); solrQuery.setQuery("*:*"); - QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); + QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); SolrDocumentList docs = rsp.getResults(); return docs.getNumFound(); } public ArrayList findNearestHarvestTimeForMultipleUrlsFullFields(Collection urls, String timeStamp) { - return findNearestDocuments(SolrUtils.indexDocFieldList, timeStamp, urls.stream()). - map(SolrUtils::solrDocument2IndexDoc). - collect(Collectors.toCollection(ArrayList::new)); + return findNearestDocuments(SolrUtils.indexDocFieldList, timeStamp, urls.stream()) + .map(SolrUtils::solrDocument2IndexDoc) + .collect(Collectors.toCollection(ArrayList::new)); } public ArrayList findNearestHarvestTimeForMultipleUrlsFewFields(Collection urls, String timeStamp){ - return findNearestDocuments(SolrUtils.indexDocFieldListShort, timeStamp, urls.stream()). - map(SolrUtils::solrDocument2IndexDocShort). - collect(Collectors.toCollection(ArrayList::new)); + return findNearestDocuments(SolrUtils.indexDocFieldListShort, timeStamp, urls.stream()) + .map(SolrUtils::solrDocument2IndexDocShort) + .collect(Collectors.toCollection(ArrayList::new)); } @@ -896,9 +896,9 @@ public ArrayList findNearestUrlsShort(Collection urls, St findNearestDocumentsLenient(SolrUtils.indexDocFieldListShort, timeStamp, urls.stream()) : findNearestDocuments(SolrUtils.indexDocFieldListShort, timeStamp, urls.stream()); - return docs. - map(SolrUtils::solrDocument2IndexDocShort). - collect(Collectors.toCollection(ArrayList::new)); + return docs + .map(SolrUtils::solrDocument2IndexDocShort) + .collect(Collectors.toCollection(ArrayList::new)); } /** @@ -917,19 +917,19 @@ public Stream findNearestDocuments( String fieldList, String timeStamp, Stream urls, String... filterQueries) { final int chunkSize = 1000; - Stream urlQueries = urls. - filter(url -> !url.startsWith("data:")). - map(NetarchiveSolrClient::normalizeUrl). - map(SolrUtils::createPhrase). - map(url -> "url_norm:" + url); + Stream urlQueries = urls + .filter(url -> !url.startsWith("data:")) + .map(NetarchiveSolrClient::normalizeUrl) + .map(SolrUtils::createPhrase) + .map(url -> "url_norm:" + url); return SRequest.builder(). queries(urlQueries). filterQueries(SolrUtils.extend(SolrUtils.NO_REVISIT_FILTER, filterQueries)). // No binary for revists - queryBatchSize(chunkSize). 
// URL-searches are single-clause queries, so we can use large batches - pageSize(chunkSize). + queryBatchSize(chunkSize). // URL-searches are single-clause queries, so we can use large batches + pageSize(chunkSize). //usePaging(false). // 1 URL = 1 hit as we deduplicate on url_norm (no longer needed) - fields(fieldList). + fields(fieldList). timeProximityDeduplication(timeStamp, "url_norm"). stream(); } @@ -957,8 +957,8 @@ public Stream findNearestDocumentsLenient( String[] extendedFilterQueries = SolrUtils.extend(SolrUtils.NO_REVISIT_FILTER, filterQueries); // Handle processing in batches of 1000 for fast resolving - return CollectionUtils.splitToStreams(urls.filter(url -> !url.startsWith("data:")), chunkSize). - flatMap(batch -> findNearestDocumentLenientSingleTake( + return CollectionUtils.splitToStreams(urls.filter(url -> !url.startsWith("data:")), chunkSize) + .flatMap(batch -> findNearestDocumentLenientSingleTake( fields, idealTime, batch, extendedFilterQueries)); } @@ -969,7 +969,7 @@ public Stream findNearestDocumentsLenient( * This implementation performs a full resolve of all URLs before delivery, which delays the time before first * delivered {@code Solrdocument} and introduces a memory overhead: This method should only be called for a limited * amount of URLs, such as 1000-10,000. -

+

* If a document cannot be resolved using direct matching with {@code url_norm:}, lenient matching is used. * Lenient first locates the {@code url_norm} closest to the original URL, then feeds that {@code url_norm} to * time prioritized resolving. When producing {@link SolrDocument}s, a normalised version of the original URL is @@ -1007,27 +1007,27 @@ private Stream findNearestDocumentLenientSingleTake( filter(urlPair -> !direct.containsKey(urlPair.first())); List> lenientURLPairs = // [originalURL, lenientResolvedNormURL] - resolveURLsLenient(Collections.singletonList("url_norm"), unresolved, filterQueries). - entrySet().stream(). - filter(entry -> entry.getValue().containsKey("url_norm")). - map(entry -> new Pair<>( + resolveURLsLenient(Collections.singletonList("url_norm"), unresolved, filterQueries) + .entrySet().stream() + .filter(entry -> entry.getValue().containsKey("url_norm")) + .map(entry -> new Pair<>( entry.getKey(), - Objects.toString(entry.getValue().getFieldValue("url_norm")))). - collect(Collectors.toList()); + Objects.toString(entry.getValue().getFieldValue("url_norm")))) + .collect(Collectors.toList()); // Use the leniently resolved url_norm for time-proximity lookup Map lenient = resolveURLsDirect(allFields, idealTime, lenientURLPairs, filterQueries); // Merge the results from direct and lenient and enrich with the SolrDocuments with originalURL - return urlPairs.stream(). - map(Pair::first). // originalURL - map(originalURL -> getValueFromMaps(originalURL, direct, lenient)). - filter(Objects::nonNull). - peek(resultPair -> resultPair.second().setField("originalURL", resultPair.first())). - peek(resultPair -> resultPair.second().setField( - "url_norm", UrlUtils.punyCodeAndNormaliseUrlSafe(resultPair.first()))). - map(Pair::second); + return urlPairs.stream() + .map(Pair::first) // originalURL + .map(originalURL -> getValueFromMaps(originalURL, direct, lenient)) + .filter(Objects::nonNull) + .peek(resultPair -> resultPair.second().setField("originalURL", resultPair.first())) + .peek(resultPair -> resultPair.second().setField( + "url_norm", UrlUtils.punyCodeAndNormaliseUrlSafe(resultPair.first()))) + .map(Pair::second); } /** @@ -1037,11 +1037,11 @@ private Stream findNearestDocumentLenientSingleTake( * @return a list of {@code [originalURL, normURL]}. */ private List> getNormalisedURLs(Stream urls) { - return urls. - map(url -> new Pair<>(url, UrlUtils.punyCodeAndNormaliseUrlSafe(url))). - filter(urlPair -> Objects.nonNull(urlPair.second())). - distinct(). - collect(Collectors.toList()); + return urls + .map(url -> new Pair<>(url, UrlUtils.punyCodeAndNormaliseUrlSafe(url))) + .filter(urlPair -> Objects.nonNull(urlPair.second())) + .distinct() + .collect(Collectors.toList()); } /** @@ -1053,12 +1053,12 @@ private List> getNormalisedURLs(Stream urls) { */ @SafeVarargs private static Pair getValueFromMaps(String key, Map... maps) { - return Arrays.stream(maps). - map(map -> map.get(key)). - filter(Objects::nonNull). - map(map -> new Pair<>(key, map)). - findFirst(). - orElse(null); + return Arrays.stream(maps) + .map(map -> map.get(key)) + .filter(Objects::nonNull) + .map(map -> new Pair<>(key, map)) + .findFirst() + .orElse(null); } public static void mergeInto(SolrDocumentList main, SolrDocumentList additional) { @@ -1085,17 +1085,17 @@ private SolrDocumentList groupsToDoc(QueryResponse rsp) { return docs; } - /** - * Creates a query for 1 or more URLs, taking care to quote URLs and escape characters where needed. 
- * The result will be in the form {@code field:("url1" OR "url2")} or {@code field:("url1" AND "url2")} - * depending on operator. - *

- * Note: {@code data:}-URLs are ignored as they will never match. - * @param field the field to query. Typically {@code url} or {@code url_norm}. - * @param operator {@code AND} or {@code OR}. - * @param urls the URLs to create a query for. - * @return a query for the given URLs. - */ + /** + * Creates a query for 1 or more URLs, taking care to quote URLs and escape characters where needed. + * The result will be in the form {@code field:("url1" OR "url2")} or {@code field:("url1" AND "url2")} + * depending on operator. + *

+ * Note: {@code data:}-URLs are ignored as they will never match. + * @param field the field to query. Typically {@code url} or {@code url_norm}. + * @param operator {@code AND} or {@code OR}. + * @param urls the URLs to create a query for. + * @return a query for the given URLs. + */ @SuppressWarnings("SameParameterValue") private String urlQueryJoin(String field, String operator, Iterable urls) { StringBuilder sb = new StringBuilder(); @@ -1103,8 +1103,8 @@ private String urlQueryJoin(String field, String operator, Iterable urls sb.append(field).append(":("); for (String url : urls) { if (url.startsWith("data:") ) { - continue; - } + continue; + } if (!first) { sb.append(" ").append(operator).append(" "); } @@ -1115,7 +1115,7 @@ private String urlQueryJoin(String field, String operator, Iterable urls return sb.toString(); } - + /* * Notice here do we not fix url_norm */ @@ -1128,10 +1128,10 @@ public IndexDoc findClosestHarvestTimeForUrl(String url, String timeStamp) throw // normalize will remove last slash if not slashpage boolean slashLast = url.endsWith("/"); - String urlNormQuery = UrlUtils.fixLegacyNormaliseUrlErrorQuery(url); - + String urlNormQuery = UrlUtils.fixLegacyNormaliseUrlErrorQuery(url); + String query = urlNormQuery +" AND status_code:200"; //Maybe also allow 400 and 404?: (status_code:200 OR status_code:400 OR status_code:404). - + SolrQuery solrQuery = new SolrQuery(); solrQuery.setQuery(query); @@ -1147,7 +1147,7 @@ public IndexDoc findClosestHarvestTimeForUrl(String url, String timeStamp) throw // other methods in this class, but not as critical there. // Hoping for a solr fix.... solrQuery.setRows(10); - QueryResponse rsp = loggedSolrQuery( + QueryResponse rsp = loggedSolrQuery( String.format("findClosestHarvestTimeForUrl(url='%s', timestamp=%s)", url.length() > 50 ? url.substring(0, 50) + "..." 
: url, timeStamp), solrQuery); @@ -1232,7 +1232,7 @@ public SolrQuery buildSolrQueryForPeriod(String query, String startDate, String public Long countTextHtmlForPeriod(String query, String startDate, String endDate) throws Exception { SolrQuery solrQuery = buildSolrQueryForPeriod(query, startDate, endDate); solrQuery.add("fq", SolrUtils.NO_REVISIT_FILTER); // do not include record_type:revisit - QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); + QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); return rsp.getResults().getNumFound(); } @@ -1241,7 +1241,7 @@ public Long countTagHtmlForPeriod(String query, String startDate, String endDate throw new InvalidArgumentServiceException("Tag syntax not accepted:" + query); } SolrQuery solrQuery = buildSolrQueryForPeriod("elements_used:\"" + query + "\"", startDate, endDate); - QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); + QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); return rsp.getResults().getNumFound(); } @@ -1261,7 +1261,7 @@ public HashMap getYearHtmlFacets(String query) throws Exception { solrQuery.set("facet.field", "crawl_year"); solrQuery.set("facet.sort", "index"); solrQuery.set("facet.limit", "500"); // 500 is higher than number of different years - QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); + QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); FacetField facetField = rsp.getFacetField("crawl_year"); @@ -1282,7 +1282,7 @@ public IndexDoc findExactMatchPWID(String url, String utc) throws Exception { solrQuery.setRows(1); // 1 page only solrQuery.add("fl", SolrUtils.indexDocFieldList); - QueryResponse rsp = loggedSolrQuery("pwidQuery", solrQuery); + QueryResponse rsp = loggedSolrQuery("pwidQuery", solrQuery); SolrDocumentList docs = rsp.getResults(); if (docs.size() == 0) { @@ -1292,7 +1292,7 @@ public IndexDoc findExactMatchPWID(String url, String utc) throws Exception { IndexDoc indexDoc = SolrUtils.solrDocument2IndexDoc(docs.get(0)); return indexDoc; } - + // Not used anymore public HashMap getYearFacetsHtmlAll() throws Exception { // facet=true&facet.field=crawl_year&facet.sort=index&facet.limit=500 @@ -1308,7 +1308,7 @@ public HashMap getYearFacetsHtmlAll() throws Exception { solrQuery.add("fq","content_type_norm:html"); // only html pages solrQuery.add("fq", SolrUtils.NO_REVISIT_FILTER); // do not include record_type:revisit - QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); + QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); FacetField facetField = rsp.getFacetField("crawl_year"); @@ -1340,7 +1340,7 @@ public HashMap getYearTextHtmlFacets(String query) throws Excepti solrQuery.add("fq","content_type_norm:html"); // only html pages solrQuery.add("fq", SolrUtils.NO_REVISIT_FILTER); // do not include record_type:revisit - QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); + QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); FacetField facetField = rsp.getFacetField("crawl_year"); @@ -1363,7 +1363,7 @@ public ArrayList findNearestForResourceNameAndDomain(String domain, St solrQuery.set("group.sort", "abs(sub(ms(" + timeStamp + "), crawl_date)) asc"); solrQuery.add("fl", SolrUtils.indexDocFieldList); solrQuery.setFilterQueries(SolrUtils.NO_REVISIT_FILTER); // No binary for revists. 
- QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); + QueryResponse rsp = solrServer.query(solrQuery, METHOD.POST); SolrDocumentList docs = groupsToDoc(rsp); return SolrUtils.solrDocList2IndexDoc(docs); } @@ -1400,14 +1400,14 @@ public String searchJsonResponseOnlyFacets(String query, List fq, boolea } } - + NoOpResponseParser rawJsonResponseParser = new NoOpResponseParser(); rawJsonResponseParser.setWriterType("json"); QueryRequest req = new QueryRequest(solrQuery); req.setResponseParser(rawJsonResponseParser); - - NamedList resp = solrServer.request(req); + + NamedList resp = solrServer.request(req); String jsonResponse = (String) resp.get("response"); return jsonResponse; } @@ -1445,14 +1445,14 @@ public String searchJsonResponseOnlyFacetsLoadMore( String query, List f } } - - + + NoOpResponseParser rawJsonResponseParser = new NoOpResponseParser(); rawJsonResponseParser.setWriterType("json"); QueryRequest req = new QueryRequest(solrQuery); req.setResponseParser(rawJsonResponseParser); - + NamedList resp = solrServer.request(req); String jsonResponse = (String) resp.get("response"); return jsonResponse; @@ -1501,8 +1501,8 @@ public String searchJsonResponseNoFacets(String query, List fq, boolean solrQuery.add("fq", filter); } } - - + + NoOpResponseParser rawJsonResponseParser = new NoOpResponseParser(); rawJsonResponseParser.setWriterType("json"); @@ -1516,7 +1516,7 @@ public String searchJsonResponseNoFacets(String query, List fq, boolean /* * field list is a comma seperated list of fields. If null all fields will loaded - * + * */ public String idLookupResponse(String id, String fieldList) throws Exception { SolrQuery solrQuery = new SolrQuery(); @@ -1526,17 +1526,17 @@ public String idLookupResponse(String id, String fieldList) throws Exception { solrQuery.set("q.op", "AND"); solrQuery.set("indent", "true"); solrQuery.set("facet", "false"); - + if (fieldList!= null) { - solrQuery.set("fl",fieldList); + solrQuery.set("fl",fieldList); } - + NoOpResponseParser rawJsonResponseParser = new NoOpResponseParser(); rawJsonResponseParser.setWriterType("json"); QueryRequest req = new QueryRequest(solrQuery); req.setResponseParser(rawJsonResponseParser); - NamedList resp = solrServer.request(req); + NamedList resp = solrServer.request(req); String jsonResponse = (String) resp.get("response"); return jsonResponse; } @@ -1569,7 +1569,7 @@ public DomainStatistics domainStatistics(String domain, String startDate, String solrQuery.add("stats", "true"); solrQuery.add("stats.field", "{!count=true cardinality=true}url_norm"); // Important, use cardinality and not unique. 
solrQuery.add("stats.field", "{!sum=true}content_length"); - QueryResponse rsp = solrServer.query(solrQuery); + QueryResponse rsp = solrServer.query(solrQuery); Map statsMap = rsp.getFieldStatsInfo(); FieldStatsInfo statsUrl_norm = statsMap.get("url_norm"); @@ -1611,13 +1611,13 @@ public String domainStatisticsForQuery(String query, List fq) throws Exc solrQuery.setQuery(query); solrQuery.setRows(0); solrQuery.set("facet", "false"); - + // default scale (by year) int startYear = PropertiesLoaderWeb.ARCHIVE_START_YEAR; int endYear = LocalDate.now().getYear() + 1; // add one since it is not incluced solrQuery.setParam("json.facet", - "{domains:{type:terms,field:domain,limit:30,facet:{years:{type:range,field:crawl_year,start:" + startYear + ",end:" + endYear + ",gap:1}}}}"); + "{domains:{type:terms,field:domain,limit:30,facet:{years:{type:range,field:crawl_year,start:" + startYear + ",end:" + endYear + ",gap:1}}}}"); for (String filter : fq) { solrQuery.addFilterQuery(filter); @@ -1652,7 +1652,7 @@ public String domainStatisticsForQuery(String query, List fq, String sta String end = enddate + "T23:59:59Z"; String gap = getGapFromScale(scale); solrQuery.setParam("json.facet", - "{domains:{type:terms,field:domain,limit:30,facet:{years:{type:range,field:crawl_date,start:'"+ start + "',end:'"+ end + "',gap:'"+ gap + "'}}}}"); + "{domains:{type:terms,field:domain,limit:30,facet:{years:{type:range,field:crawl_date,start:'"+ start + "',end:'"+ end + "',gap:'"+ gap + "'}}}}"); solrQuery.addFilterQuery("crawl_date:[" + start + " TO " + end + "]"); for (String filter : fq) { @@ -1671,7 +1671,7 @@ public String domainStatisticsForQuery(String query, List fq, String sta /** * Determine the gap for Solr from the time scale - * + * * @param scale the time scale * @return the gap */ @@ -1700,7 +1700,7 @@ public static long getOffset(SolrDocument doc) { return (Long) doc.get("source_file_offset"); } - private static String normalizeUrl(String url) { + private static String normalizeUrl(String url) { return Normalisation.canonicaliseURL(url); } @@ -1738,7 +1738,7 @@ private QueryResponse loggedSolrQuery(String caller, SolrQuery solrQuery) throws * @return the result of issuing the query. */ public static QueryResponse query(SolrQuery solrQuery, boolean useCachingClient) { - try { + try { return useCachingClient ? solrServer.query(solrQuery, METHOD.POST) : noCacheSolrServer.query(solrQuery, METHOD.POST); @@ -1755,7 +1755,7 @@ public static QueryResponse query(SolrQuery solrQuery, boolean useCachingClient) public long getLenientAttempts() { return lenientAttempts.get(); } - + /** * @return the number of successful attempts for resolving an URL leniently with extended argument query. 
*/ From 5cbbdd98a04f626926ceb5803a076f21cf1fbf8b Mon Sep 17 00:00:00 2001 From: Toke Eskildsen Date: Mon, 16 Oct 2023 20:53:20 +0200 Subject: [PATCH 06/10] Trim the excessive logged stack trace for missing WARC files --- .../solrwayback/interfaces/ArcSource.java | 15 +++++++++++++-- .../parsers/ArcParserFileResolver.java | 14 +++++++++----- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/src/main/java/dk/kb/netarchivesuite/solrwayback/interfaces/ArcSource.java b/src/main/java/dk/kb/netarchivesuite/solrwayback/interfaces/ArcSource.java index 281734b1..6e981b45 100644 --- a/src/main/java/dk/kb/netarchivesuite/solrwayback/interfaces/ArcSource.java +++ b/src/main/java/dk/kb/netarchivesuite/solrwayback/interfaces/ArcSource.java @@ -16,9 +16,12 @@ import dk.kb.netarchivesuite.solrwayback.properties.PropertiesLoader; import dk.kb.netarchivesuite.solrwayback.util.SkippingHTTPInputStream; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; import java.io.FileInputStream; +import java.io.FileNotFoundException; import java.io.IOException; import java.io.InputStream; import java.net.MalformedURLException; @@ -36,6 +39,8 @@ * {@code warc.file.resolver.source.http.readfallback=true} */ public class ArcSource implements Supplier { + private static final Logger log = LoggerFactory.getLogger(ArcSource.class); + private static final Pattern HTTP = Pattern.compile("^https?://.*"); private static final Pattern FILE = Pattern.compile("^file://.*"); @@ -81,8 +86,12 @@ public static ArcSource fromFile(String file) { try { // TODO: Verify that Files.newInputStream supports efficient skipping then switch to that return new FileInputStream(file); - } catch (IOException e) { - throw new RuntimeException("Unable to create FileInputStream for '" + file + "'", e); + } catch (FileNotFoundException e) { + log.error("FileNotFoundException trying to access (W)ARC '{}'", file); + throw new RuntimeException("FileNotFoundException trying to access (W)ARC '" + file + "'", e); + } catch (Exception e) { + log.error("Unable to create FileInputStream for (W)ARC '" + file + "'", e); + throw new RuntimeException("Unable to create FileInputStream for (W)ARC '" + file + "'", e); } }); } @@ -106,6 +115,8 @@ public static ArcSource fromHTTP(String httpURL) { try { return new SkippingHTTPInputStream(url, PropertiesLoader.WARC_SOURCE_HTTP_FALLBACK); } catch (IOException e) { + // TODO: This could be extended with a check for 404 for better error message + log.error("Unable to open stream for '" + httpURL + "'", e); throw new RuntimeException("Unable to open stream for '" + httpURL + "'", e); } }); diff --git a/src/main/java/dk/kb/netarchivesuite/solrwayback/parsers/ArcParserFileResolver.java b/src/main/java/dk/kb/netarchivesuite/solrwayback/parsers/ArcParserFileResolver.java index 2ebcee32..319a9dd3 100644 --- a/src/main/java/dk/kb/netarchivesuite/solrwayback/parsers/ArcParserFileResolver.java +++ b/src/main/java/dk/kb/netarchivesuite/solrwayback/parsers/ArcParserFileResolver.java @@ -1,15 +1,15 @@ package dk.kb.netarchivesuite.solrwayback.parsers; -import java.util.HashMap; - +import dk.kb.netarchivesuite.solrwayback.interfaces.ArcFileLocationResolverInterface; import dk.kb.netarchivesuite.solrwayback.interfaces.ArcSource; import dk.kb.netarchivesuite.solrwayback.interfaces.RewriteLocationResolver; +import dk.kb.netarchivesuite.solrwayback.service.dto.ArcEntry; +import dk.kb.netarchivesuite.solrwayback.service.exception.NotFoundServiceException; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import 
dk.kb.netarchivesuite.solrwayback.interfaces.ArcFileLocationResolverInterface; -import dk.kb.netarchivesuite.solrwayback.interfaces.IdentityArcFileResolver; -import dk.kb.netarchivesuite.solrwayback.service.dto.ArcEntry; +import java.io.FileNotFoundException; +import java.util.HashMap; /* * This class will resolve the arc-file location using source_file_path from the index. @@ -57,6 +57,10 @@ public static ArcEntry getArcEntry(String source_file_path_org, long offset) thr return ArcFileParserFactory.getArcEntry(arcSource, offset); } catch (Exception e) { + if (e instanceof RuntimeException && e.getCause() instanceof FileNotFoundException) { + // The only thing throwing FileNotFoundExceptions should be ArcSource.get and that already logs errors + throw new NotFoundServiceException("Unable to locate (W)ARC '" + source_file_path + "'"); + } // It CAN happen, but crazy unlikely, and not critical at all... (took 10 // threads spamming 1M+ requests/sec for it to happen in a test.): log.error("Critical error resolving warc:" + source_file_path + " and offset:" + offset + " Error:" + e.getMessage()); From b304d3f59a6110bb3fa32ebffc33cc7e13a74865 Mon Sep 17 00:00:00 2001 From: Toke Eskildsen Date: Mon, 16 Oct 2023 21:32:54 +0200 Subject: [PATCH 07/10] Change CSV export to use ISO-8601 for time and not quote numbers. This closes #323 --- .../solrwayback/export/GenerateCSV.java | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/main/java/dk/kb/netarchivesuite/solrwayback/export/GenerateCSV.java b/src/main/java/dk/kb/netarchivesuite/solrwayback/export/GenerateCSV.java index c1576302..d93a782c 100644 --- a/src/main/java/dk/kb/netarchivesuite/solrwayback/export/GenerateCSV.java +++ b/src/main/java/dk/kb/netarchivesuite/solrwayback/export/GenerateCSV.java @@ -1,13 +1,16 @@ package dk.kb.netarchivesuite.solrwayback.export; +import java.util.Date; import java.util.List; +import dk.kb.netarchivesuite.solrwayback.util.DateUtils; import org.apache.solr.common.SolrDocument; /** * Created by teg on 10/28/16. 
*/
+// TODO: This would be better with a dedicated CSV writer to handle the different types and escaping properly
 public class GenerateCSV {

   private static String NEWLINE="\n";
@@ -72,8 +75,15 @@ public static void generateLine(StringBuffer buffer,SolrDocument doc, String[]

       if (field_value != null) { //if null, just output a tab
         if (field_value instanceof List) { //if multivalued
           field_value = String.join(MULTIVALUE_SEPARATOR, (List) field_value);
         }
-        String escaped = escapeQuotes(field_value.toString());
-        result.append(escaped);
+        if (field_value instanceof String) {
+          String escaped = escapeQuotes(field_value.toString());
+          result.append(escaped);
+        } else if (field_value instanceof Date) {
+          result.append(escapeQuotes(DateUtils.getSolrDate((Date) field_value)));
+        } else {
+          // Numbers and boolean
+          result.append(field_value);
+        }
       } else {
         result.append(escapeQuotes(""));
       }

From 3d0408922d0d2882893516f139d18976cea17a2b Mon Sep 17 00:00:00 2001
From: Toke Eskildsen
Date: Tue, 17 Oct 2023 06:48:43 +0200
Subject: [PATCH 08/10] Clean up if-else chain and align the code style for escaping Strings

---
 .../solrwayback/export/GenerateCSV.java | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/main/java/dk/kb/netarchivesuite/solrwayback/export/GenerateCSV.java b/src/main/java/dk/kb/netarchivesuite/solrwayback/export/GenerateCSV.java
index d93a782c..005da225 100644
--- a/src/main/java/dk/kb/netarchivesuite/solrwayback/export/GenerateCSV.java
+++ b/src/main/java/dk/kb/netarchivesuite/solrwayback/export/GenerateCSV.java
@@ -73,15 +73,14 @@ public static void generateLine(StringBuffer buffer,SolrDocument doc, String[]

       if (field_value != null) { //if null, just output a tab
         if (field_value instanceof List) { //if multivalued
-          field_value = String.join(MULTIVALUE_SEPARATOR, (List) field_value);
-        }
-        if (field_value instanceof String) {
-          String escaped = escapeQuotes(field_value.toString());
-          result.append(escaped);
+          result.append(String.join(MULTIVALUE_SEPARATOR, (List) field_value));
+        } else if (field_value instanceof String) {
+          result.append(escapeQuotes(field_value.toString()));
         } else if (field_value instanceof Date) {
+          // Dates formatted to ISO-8601 with second granularity
           result.append(escapeQuotes(DateUtils.getSolrDate((Date) field_value)));
         } else {
-          // Numbers and boolean
+          // Numbers and booleans are appended directly (no quotes)
           result.append(field_value);
         }
       } else {

From b772a7807a87d3e06c45b229ebd998d1a1c6c66d Mon Sep 17 00:00:00 2001
From: Toke Eskildsen
Date: Tue, 17 Oct 2023 09:26:39 +0200
Subject: [PATCH 09/10] RestrictedSolrClient extended to handle QueryRequests

---
 .../solr/RestrictedSolrClient.java | 31 ++++++++++++++++---
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/RestrictedSolrClient.java b/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/RestrictedSolrClient.java
index b9febfee..f91d7d4c 100644
--- a/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/RestrictedSolrClient.java
+++ b/src/main/java/dk/kb/netarchivesuite/solrwayback/solr/RestrictedSolrClient.java
@@ -21,6 +21,7 @@
 import org.apache.solr.client.solrj.StreamingResponseCallback;
 import org.apache.solr.client.solrj.beans.DocumentObjectBinder;
 import org.apache.solr.client.solrj.impl.HttpSolrClient;
+import org.apache.solr.client.solrj.request.QueryRequest;
 import org.apache.solr.client.solrj.response.QueryResponse;
 import org.apache.solr.client.solrj.response.SolrPingResponse;
 import org.apache.solr.client.solrj.response.UpdateResponse;
@@ -159,9 +160,26 @@
private SolrParams restrict(SolrParams params) {
    */
   @SuppressWarnings("rawtypes")
   private SolrRequest restrict(SolrRequest request) {
-    log.error("restrict(SolrRequest) called, but is not implemented yet, " +
-        "as it was alledgedly not used in SolrWayback");
-    throw new UnsupportedOperationException("Restriction of SolrRequests not supported yet");
+//    if (fixedParams == null || fixedParams.isEmpty()) {
+//      return request;
+//    }
+    if (request instanceof QueryRequest) {
+      QueryRequest oldQR = (QueryRequest)request;
+      QueryRequest newQR = new QueryRequest(restrict(oldQR.getParams()), oldQR.getMethod());
+      newQR.setPath(oldQR.getPath());
+      newQR.setResponseParser(oldQR.getResponseParser());
+      newQR.setBasicAuthCredentials(oldQR.getBasicAuthUser(), oldQR.getBasicAuthPassword());
+      newQR.setStreamingResponseCallback(oldQR.getStreamingResponseCallback());
+      // newQR.setUseBinaryV2 // TODO: Problem: We cannot get this value from oldQR
+      // newQR.setUseV2 // TODO: Problem: We cannot get this value from oldQR
+      // newQR.setQueryParams // Derived from params so don't change these
+      return newQR;
+    }
+    log.error("restrict(SolrRequest) called with a SolrRequest that was not a QueryRequest. " +
+        "This is not implemented yet, as it was allegedly not used in SolrWayback",
+        new RuntimeException("Stacktrace"));
+    throw new UnsupportedOperationException(
+        "Restriction of SolrRequests that are not QueryRequests not supported yet");
   }

 /* Delegates below where restrict(...) and defaultCollection are applied when possible */
@@ -474,9 +492,14 @@ public SolrDocumentList getById(Collection ids, SolrParams params) throw
 @Override
 public NamedList request(SolrRequest request, String collection) throws SolrServerException, IOException {
-    return inner.request(restrict(request), collection);
+    return inner.request(restrict(request), collection == null ? defaultCollection : collection);
 }

+  // Here we should override the collection-less method in SolrClient. But it is final!?
+  // In reality it is not a problem as it (always?) redirects to request(request, null)
+  // public final NamedList request(final SolrRequest request) throws SolrServerException, IOException
+
+
 @Override
 public DocumentObjectBinder getBinder() {
   return inner.getBinder();

From 94abc1f0b8459a556127c86cae34e5856a351722 Mon Sep 17 00:00:00 2001
From: Toke Eskildsen
Date: Tue, 17 Oct 2023 11:59:23 +0200
Subject: [PATCH 10/10] Bugfix: The updated CSV code did not quote lists

---
 .../dk/kb/netarchivesuite/solrwayback/export/GenerateCSV.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/main/java/dk/kb/netarchivesuite/solrwayback/export/GenerateCSV.java b/src/main/java/dk/kb/netarchivesuite/solrwayback/export/GenerateCSV.java
index 005da225..e80f1291 100644
--- a/src/main/java/dk/kb/netarchivesuite/solrwayback/export/GenerateCSV.java
+++ b/src/main/java/dk/kb/netarchivesuite/solrwayback/export/GenerateCSV.java
@@ -73,7 +73,7 @@ public static void generateLine(StringBuffer buffer,SolrDocument doc, String[]
       if (field_value != null) { //if null, just output a tab
         if (field_value instanceof List) { //if multivalued
-          result.append(String.join(MULTIVALUE_SEPARATOR, (List) field_value));
+          result.append(escapeQuotes(String.join(MULTIVALUE_SEPARATOR, (List) field_value)));
         } else if (field_value instanceof String) {
           result.append(escapeQuotes(field_value.toString()));
         } else if (field_value instanceof Date) {
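
For reference, the combined effect of patches 07, 08 and 10 on the CSV value handling, reduced to a minimal standalone sketch. escapeQuotes, MULTIVALUE_SEPARATOR and the DateUtils.getSolrDate formatting are stand-ins below, since their real definitions are not part of the hunks above; the double-quote escaping rule and the separator value are assumptions, not the shipped code.

// Sketch only: mirrors the per-value formatting in GenerateCSV.generateLine after patch 10.
import java.util.Arrays;
import java.util.Date;
import java.util.List;

public class CsvValueSketch {
    // Assumption: the real separator value is not visible in the hunks above
    private static final String MULTIVALUE_SEPARATOR = ", ";

    // Assumption: standard CSV double-quote escaping; the real escapeQuotes is not shown
    private static String escapeQuotes(String s) {
        return "\"" + s.replace("\"", "\"\"") + "\"";
    }

    @SuppressWarnings("unchecked")
    static String format(Object fieldValue) {
        if (fieldValue == null) {                  // null becomes an empty quoted cell
            return escapeQuotes("");
        } else if (fieldValue instanceof List) {   // multivalued: joined, then quoted (the patch 10 fix)
            return escapeQuotes(String.join(MULTIVALUE_SEPARATOR, (List<String>) fieldValue));
        } else if (fieldValue instanceof String) { // plain Strings: quoted
            return escapeQuotes((String) fieldValue);
        } else if (fieldValue instanceof Date) {   // stand-in for DateUtils.getSolrDate: ISO-8601, second granularity
            return escapeQuotes(java.time.Instant.ofEpochMilli(((Date) fieldValue).getTime())
                    .truncatedTo(java.time.temporal.ChronoUnit.SECONDS).toString());
        }
        return String.valueOf(fieldValue);         // numbers and booleans: unquoted
    }

    public static void main(String[] args) {
        System.out.println(format("say \"hi\""));            // -> "say ""hi"""
        System.out.println(format(1234L));                    // -> 1234
        System.out.println(format(Arrays.asList("a", "b")));  // -> "a, b"
        System.out.println(format(new Date(0L)));             // -> "1970-01-01T00:00:00Z"
    }
}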