Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

EVA-3501 - adding new endpoint for parsing fasta file data #80

Merged
merged 5 commits into from
Feb 23, 2024
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import org.springframework.http.ResponseEntity;
import org.springframework.web.bind.annotation.PathVariable;
import org.springframework.web.bind.annotation.PutMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;

Expand Down Expand Up @@ -76,4 +77,32 @@ public ResponseEntity<?> fetchAndInsertSeqColByAssemblyAccession(
return new ResponseEntity<>(e.getMessage(), HttpStatus.CONFLICT);
}
}

/**
 * Ingest the seqCol objects described by FASTA content sent in the request body.
 *
 * The work is delegated to {@code seqColService.fetchAndInsertAllSeqColInFastaFile}; this method
 * only maps the outcome to an HTTP status: 201 with the ingestion result on success, 500 for an
 * I/O failure, 404 when the assembly is not found and 409 for a duplicate seqCol or an already
 * ingested assembly. Error responses carry the exception message as their body.
 *
 * @param accession        accession taken from the request path
 * @param fastaFileContent raw FASTA text taken from the request body
 * @return the response entity describing the outcome
 */
@Operation(summary = "Add new sequence collection objects",
        description = "Given FASTA file content, this endpoint will parse the content and use it to construct " +
                "seqCol objects with naming convention TEST and eventually save these seqCol objects into the database. " +
                "This is an authenticated endpoint, so it requires admin privileges to run it.")
@ApiResponses(value = {
        @ApiResponse(responseCode = "201", description = "seqCol object(s) successfully inserted"),
        @ApiResponse(responseCode = "409", description = "seqCol object(s) already exist(s)"),
        @ApiResponse(responseCode = "404", description = "Assembly not found"),
        @ApiResponse(responseCode = "400", description = "Bad request. (It can be a bad accession value)"),
        @ApiResponse(responseCode = "500", description = "Server Error")
})
@PutMapping(value = "/seqcols/fasta/{accession}")
public ResponseEntity<?> fetchAndInsertSeqColByParsingFastaFile(
        @PathVariable(value = "accession") String accession,
        @RequestBody String fastaFileContent) {
    try {
        // Happy path: hand the FASTA text to the service and report the result as "created".
        return new ResponseEntity<>(
                seqColService.fetchAndInsertAllSeqColInFastaFile(accession, fastaFileContent),
                HttpStatus.CREATED);
    } catch (IOException ioFailure) {
        ioFailure.printStackTrace();
        return new ResponseEntity<>(ioFailure.getMessage(), HttpStatus.INTERNAL_SERVER_ERROR);
    } catch (DuplicateSeqColException duplicate) {
        return new ResponseEntity<>(duplicate.getMessage(), HttpStatus.CONFLICT);
    } catch (AssemblyNotFoundException notFound) {
        return new ResponseEntity<>(notFound.getMessage(), HttpStatus.NOT_FOUND);
    } catch (AssemblyAlreadyIngestedException alreadyIngested) {
        return new ResponseEntity<>(alreadyIngested.getMessage(), HttpStatus.CONFLICT);
    }
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import uk.ac.ebi.eva.evaseqcol.entities.AssemblySequenceEntity;
import uk.ac.ebi.eva.evaseqcol.utils.GzipCompress;

import java.io.ByteArrayInputStream;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
Expand Down Expand Up @@ -43,6 +44,17 @@ public NCBIAssemblySequenceDataSource(NCBIBrowserFactory factory,
this.readerFactory = readerFactory;
}

/**
 * Build the assembly sequence entity from FASTA content supplied by the caller instead of a
 * downloaded file.
 *
 * @param accession        accession handed to the reader factory and reported in the log
 * @param fastaFileContent full text of the FASTA file; must not be {@code null}
 * @return the entity produced by the reader, or an empty Optional when the reader yields none
 * @throws IOException if reading or parsing the content fails
 */
public Optional<AssemblySequenceEntity> getAssemblySequencesByAccession(String accession, String fastaFileContent) throws IOException {
    AssemblySequenceEntity assemblySequenceEntity;
    // Encode with an explicit charset so the parsed bytes do not depend on the JVM's platform default.
    byte[] fastaBytes = fastaFileContent.getBytes(java.nio.charset.StandardCharsets.UTF_8);
    try (InputStream stream = new ByteArrayInputStream(fastaBytes)) {
        NCBIAssemblySequenceReader reader = readerFactory.build(stream, accession);
        assemblySequenceEntity = reader.getAssemblySequencesEntity();
        logger.info("FASTA file content with accession " + accession + " has been parsed successfully");
    }

    // ofNullable keeps the "absent" case representable, so callers' isPresent() checks are meaningful
    // rather than this method failing with a NullPointerException.
    return Optional.ofNullable(assemblySequenceEntity);
}

@Override
public Optional<AssemblySequenceEntity> getAssemblySequencesByAccession(String accession) throws IOException, IllegalArgumentException {
NCBIBrowser ncbiBrowser = factory.build();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import uk.ac.ebi.eva.evaseqcol.utils.JSONLevelOne;

import java.io.IOException;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
Expand Down Expand Up @@ -86,4 +87,27 @@ public Optional<Map<String, Object>> getAllPossibleSeqColExtendedData(String acc

return Optional.of(seqColResultData);
}

/**
 * Build the seqCol extended data for an assembly from FASTA content supplied by the caller.
 *
 * The returned map has two entries: {@code "sameValueAttributes"}, a map holding the
 * {@code extendedLengths}, {@code extendedSequences} and {@code extendedMd5Sequences} objects, and
 * {@code "namesAttributes"}, a single-element list holding the names object built with the
 * {@code TEST} naming convention.
 *
 * @param accession        accession forwarded to the sequence data source and used in log messages
 * @param fastaFileContent full text of the FASTA file
 * @return the seqCol data map, or an empty Optional when no sequence entity could be produced
 * @throws IOException if parsing the content or building an extended data object fails
 */
public Optional<Map<String, Object>> getAllPossibleSeqColExtendedData(String accession, String fastaFileContent) throws IOException {
    // Fetching Sequence Entity (FASTA File)
    Optional<AssemblySequenceEntity> sequenceEntity = assemblySequenceDataSource.getAssemblySequencesByAccession(accession, fastaFileContent);
    if (!sequenceEntity.isPresent()) {
        logger.error("Could not parse FASTA file content for accession " + accession);
        return Optional.empty();
    }
    logger.info("FASTA file content for accession " + accession + " has been parsed successfully");

    // Unwrap once; every extended data object below is derived from the same parsed entity.
    AssemblySequenceEntity parsedSequences = sequenceEntity.get();

    Map<String, Object> sameValueAttributesMap = new HashMap<>();
    sameValueAttributesMap.put("extendedLengths", SeqColExtendedDataEntity.constructSeqColLengthsObject(parsedSequences));
    sameValueAttributesMap.put("extendedSequences", SeqColExtendedDataEntity.constructSeqColSequencesObject(parsedSequences));
    sameValueAttributesMap.put("extendedMd5Sequences", SeqColExtendedDataEntity.constructSeqColSequencesMd5Object(parsedSequences));

    // Seqcol Result Data Map
    Map<String, Object> seqColResultData = new HashMap<>();
    seqColResultData.put("sameValueAttributes", sameValueAttributesMap);
    seqColResultData.put("namesAttributes", Collections.singletonList(SeqColExtendedDataEntity
            .constructSeqColNamesObjectByNamingConvention(parsedSequences, SeqColEntity.NamingConvention.TEST)));
    return Optional.of(seqColResultData);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ protected void parseFile() throws IOException, NullPointerException {
while (line != null){
if (line.startsWith(">")){
SeqColSequenceEntity sequence = new SeqColSequenceEntity();
String refSeq = line.substring(1, line.indexOf(' '));
String refSeq = line.substring(1).split(" ")[0];
sequence.setRefseq(refSeq);
line = reader.readLine();
StringBuilder sequenceValue = new StringBuilder();
Expand All @@ -45,6 +45,7 @@ protected void parseFile() throws IOException, NullPointerException {
String sha512Checksum = sha512ChecksumCalculator.calculateRefgetChecksum(sequenceValue.toString());
sequence.setSequenceMD5(md5checksum);
sequence.setSequence(sha512Checksum);
sequence.setLength(sequenceValue.length());
sequences.add(sequence);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,6 @@ public abstract class SeqColEntity {


/**
 * Naming conventions under which a seqCol names attribute can be built.
 * {@code TEST} is the convention used when the names are taken from caller-supplied FASTA content.
 */
public enum NamingConvention {
    ENA, GENBANK, UCSC, TEST
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;
import java.util.stream.Collectors;

@Entity
@TypeDefs({
Expand Down Expand Up @@ -117,6 +118,20 @@ public static SeqColExtendedDataEntity<List<String>> constructSeqColNamesObjectB
return seqColNamesObject;
}

public static SeqColExtendedDataEntity<List<String>> constructSeqColNamesObjectByNamingConvention(
tcezard marked this conversation as resolved.
Show resolved Hide resolved
AssemblySequenceEntity sequenceEntity, SeqColEntity.NamingConvention convention) throws IOException {
SeqColExtendedDataEntity<List<String>> seqColNamesObject = new SeqColExtendedDataEntity<List<String>>().setAttributeType(
SeqColExtendedDataEntity.AttributeType.names);
seqColNamesObject.setNamingConvention(convention);
JSONExtData<List<String>> seqColNamesArray = new JSONStringListExtData();
List<String> namesList = sequenceEntity.getSequences().stream().map(s -> s.getRefseq()).collect(Collectors.toList());
DigestCalculator digestCalculator = new DigestCalculator();
seqColNamesArray.setObject(namesList);
seqColNamesObject.setExtendedSeqColData(seqColNamesArray);
seqColNamesObject.setDigest(digestCalculator.getSha512Digest(seqColNamesArray.toString()));
return seqColNamesObject;
}

/**
* Return the seqCol lengths array object*/
public static SeqColExtendedDataEntity<List<Integer>> constructSeqColLengthsObject(AssemblyEntity assemblyEntity) throws IOException {
Expand All @@ -136,6 +151,21 @@ public static SeqColExtendedDataEntity<List<Integer>> constructSeqColLengthsObje
return seqColLengthsObject;
}


/**
 * Return the seqCol lengths array object built from a parsed sequence entity.
 *
 * The lengths are read from each sequence of the entity, in the order the entity returns them;
 * the digest is computed from the string form of the populated lengths array.
 *
 * @param sequenceEntity parsed sequences whose lengths are collected
 * @return the lengths extended data object
 * @throws IOException if the digest computation fails
 */
public static SeqColExtendedDataEntity<List<Integer>> constructSeqColLengthsObject(AssemblySequenceEntity sequenceEntity) throws IOException {
    List<Integer> collectedLengths = sequenceEntity.getSequences()
            .stream()
            .map(sequence -> sequence.getLength())
            .collect(Collectors.toList());

    JSONExtData<List<Integer>> lengthsPayload = new JSONIntegerListExtData();
    lengthsPayload.setObject(collectedLengths);

    SeqColExtendedDataEntity<List<Integer>> lengthsEntity = new SeqColExtendedDataEntity<List<Integer>>()
            .setAttributeType(SeqColExtendedDataEntity.AttributeType.lengths);
    lengthsEntity.setExtendedSeqColData(lengthsPayload);
    lengthsEntity.setDigest(new DigestCalculator().getSha512Digest(lengthsPayload.toString()));
    return lengthsEntity;
}

/**
* Return the seqCol sequences array object*/
public static SeqColExtendedDataEntity<List<String>> constructSeqColSequencesObject(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ public class SeqColSequenceEntity {
private String sequenceMD5;
@ApiModelProperty(value = "Sequence's defalut (ga4gh) checksum value")
private String sequence;
@ApiModelProperty(value = "Sequence's length")
private Integer length;

public SeqColSequenceEntity setRefseq(String refseq) {
this.refseq = refseq;
Expand All @@ -26,4 +28,9 @@ public SeqColSequenceEntity setSequence(String sequence) {
this.sequence = sequence;
return this;
}

/**
 * Set the sequence's length.
 *
 * @param length the length to store on this entity
 * @return this entity, so setter calls can be chained
 */
public SeqColSequenceEntity setLength(Integer length) {
this.length = length;
return this;
}
}
18 changes: 13 additions & 5 deletions src/main/java/uk/ac/ebi/eva/evaseqcol/service/SeqColService.java
Original file line number Diff line number Diff line change
Expand Up @@ -154,16 +154,24 @@ public void removeAllSeqCol() {
extendedDataService.removeAllSeqColExtendedEntities();
}

/**
 * Build the seqCol data from the given FASTA content and insert the resulting seqCol objects.
 *
 * @param accession        accession associated with the FASTA content
 * @param fastaFileContent full text of the FASTA file
 * @return the ingestion result produced by {@code createSeqColObjectsAndInsert}
 * @throws IOException if building the seqCol data or inserting it fails with an I/O error
 */
public IngestionResultEntity fetchAndInsertAllSeqColInFastaFile(String accession, String fastaFileContent) throws IOException {
    return createSeqColObjectsAndInsert(
            ncbiSeqColDataSource.getAllPossibleSeqColExtendedData(accession, fastaFileContent),
            accession);
}

/**
* Fetch and insert all possible seqCol objects for the given assembly accession.
* NOTE: All possible seqCol objects means with all possible/provided naming conventions that could be found in the
* assembly report.
* Return the list of level 0 digests of the inserted seqcol objects*/
public IngestionResultEntity fetchAndInsertAllSeqColByAssemblyAccession(
String assemblyAccession) throws IOException, DuplicateSeqColException, AssemblyNotFoundException,
AssemblyAlreadyIngestedException{
Optional<Map<String, Object>> seqColDataMap = ncbiSeqColDataSource
.getAllPossibleSeqColExtendedData(assemblyAccession);
public IngestionResultEntity fetchAndInsertAllSeqColByAssemblyAccession(String assemblyAccession) throws IOException {
Optional<Map<String, Object>> seqColDataMap = ncbiSeqColDataSource.getAllPossibleSeqColExtendedData(assemblyAccession);
return createSeqColObjectsAndInsert(seqColDataMap, assemblyAccession);
}


public IngestionResultEntity createSeqColObjectsAndInsert(Optional<Map<String, Object>> seqColDataMap,
String assemblyAccession) throws IOException {
if (!seqColDataMap.isPresent()) {
logger.warn("No seqCol data corresponding to assemblyAccession " + assemblyAccession + " could be found on NCBI datasource");
throw new AssemblyNotFoundException(assemblyAccession);
Expand Down
Loading