Skip to content

Commit

Permalink
Merge branch 'master' of git@github.com:netarchivesuite/solrwayback.git
Browse files: browse the repository at this point in the history
  • Loading branch information
Thomas Egense committed Oct 19, 2023
2 parents 65b1b5e + dfcabad commit 64f98d1
Show file tree
Hide file tree
Showing 8 changed files with 686 additions and 200 deletions.
Original file line number Diff line number Diff line change
@@ -1,13 +1,16 @@
package dk.kb.netarchivesuite.solrwayback.export;


import java.util.Date;
import java.util.List;

import dk.kb.netarchivesuite.solrwayback.util.DateUtils;
import org.apache.solr.common.SolrDocument;

/**
* Created by teg on 10/28/16.
*/
// TODO: This would be better with a dedicated CSV writer to handle the different types and escaping properly
public class GenerateCSV {

private static String NEWLINE="\n";
Expand Down Expand Up @@ -70,10 +73,16 @@ public static void generateLine(StringBuffer buffer,SolrDocument doc, String[]
if (field_value != null) { //if null, just output a tab

if (field_value instanceof List) { //if multivalued
field_value = String.join(MULTIVALUE_SEPARATOR, (List<String>) field_value);
result.append(escapeQuotes(String.join(MULTIVALUE_SEPARATOR, (List<String>) field_value)));
} else if (field_value instanceof String) {
result.append(escapeQuotes(field_value.toString()));
} else if (field_value instanceof Date) {
// Dates formatted to ISO-8601 with second granularity
result.append(escapeQuotes(DateUtils.getSolrDate((Date) field_value)));
} else {
// Numbers and boolean are appended directly (no quotes)
result.append(field_value);
}
String escaped = escapeQuotes(field_value.toString());
result.append(escaped);
} else {
result.append(escapeQuotes(""));
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,12 @@

import dk.kb.netarchivesuite.solrwayback.properties.PropertiesLoader;
import dk.kb.netarchivesuite.solrwayback.util.SkippingHTTPInputStream;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.FileInputStream;

import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
Expand All @@ -36,6 +39,8 @@
* {@code warc.file.resolver.source.http.readfallback=true}
*/
public class ArcSource implements Supplier<InputStream> {
private static final Logger log = LoggerFactory.getLogger(ArcSource.class);

private static final Pattern HTTP = Pattern.compile("^https?://.*");
private static final Pattern FILE = Pattern.compile("^file://.*");

Expand Down Expand Up @@ -81,8 +86,12 @@ public static ArcSource fromFile(String file) {
try {
// TODO: Verify that Files.newInputStream supports efficient skipping then switch to that
return new FileInputStream(file);
} catch (IOException e) {
throw new RuntimeException("Unable to create FileInputStream for '" + file + "'", e);
} catch (FileNotFoundException e) {
log.error("FileNotFoundException trying to access (W)ARC '{}'", file);
throw new RuntimeException("FileNotFoundException trying to access (W)ARC '" + file + "'", e);
} catch (Exception e) {
log.error("Unable to create FileInputStream for (W)ARC '" + file + "'", e);
throw new RuntimeException("Unable to create FileInputStream for (W)ARC '" + file + "'", e);
}
});
}
Expand All @@ -106,6 +115,8 @@ public static ArcSource fromHTTP(String httpURL) {
try {
return new SkippingHTTPInputStream(url, PropertiesLoader.WARC_SOURCE_HTTP_FALLBACK);
} catch (IOException e) {
// TODO: This could be extended with a check for 404 for better error message
log.error("Unable to open stream for '" + httpURL + "'", e);
throw new RuntimeException("Unable to open stream for '" + httpURL + "'", e);
}
});
Expand Down
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
package dk.kb.netarchivesuite.solrwayback.parsers;

import java.util.HashMap;

import dk.kb.netarchivesuite.solrwayback.interfaces.ArcFileLocationResolverInterface;
import dk.kb.netarchivesuite.solrwayback.interfaces.ArcSource;
import dk.kb.netarchivesuite.solrwayback.interfaces.RewriteLocationResolver;
import dk.kb.netarchivesuite.solrwayback.service.dto.ArcEntry;
import dk.kb.netarchivesuite.solrwayback.service.exception.NotFoundServiceException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import dk.kb.netarchivesuite.solrwayback.interfaces.ArcFileLocationResolverInterface;
import dk.kb.netarchivesuite.solrwayback.interfaces.IdentityArcFileResolver;
import dk.kb.netarchivesuite.solrwayback.service.dto.ArcEntry;
import java.io.FileNotFoundException;
import java.util.HashMap;

/*
* This class will resolve the arc-file location using source_file_path from the index.
Expand Down Expand Up @@ -57,6 +57,10 @@ public static ArcEntry getArcEntry(String source_file_path_org, long offset) thr
return ArcFileParserFactory.getArcEntry(arcSource, offset);

} catch (Exception e) {
if (e instanceof RuntimeException && e.getCause() instanceof FileNotFoundException) {
// The only thing throwing FileNotFoundExceptions should be ArcSource.get and that already logs errors
throw new NotFoundServiceException("Unable to locate (W)ARC '" + source_file_path + "'");
}
// It CAN happen, but crazy unlikely, and not critical at all... (took 10
// threads spamming 1M+ requests/sec for it to happen in a test.):
log.error("Critical error resolving warc:" + source_file_path + " and offset:" + offset + " Error:" + e.getMessage());
Expand Down
Loading

0 comments on commit 64f98d1

Please sign in to comment.