Skip to content

Commit

Permalink
Merge pull request #30 from waterflow80/sorted-name-length-pair
Browse files Browse the repository at this point in the history
added the sorted-name-length-pair attribute to the model
  • Loading branch information
waterflow80 authored Aug 17, 2023
2 parents 2b5e1fb + 1438fb2 commit 6af09f5
Show file tree
Hide file tree
Showing 8 changed files with 142 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import org.hibernate.annotations.TypeDef;
import org.hibernate.annotations.TypeDefs;

import uk.ac.ebi.eva.evaseqcol.model.NameLengthPairEntity;
import uk.ac.ebi.eva.evaseqcol.refget.SHA512ChecksumCalculator;
import uk.ac.ebi.eva.evaseqcol.utils.JSONExtData;

Expand All @@ -18,6 +19,7 @@
import javax.persistence.Transient;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Comparator;
import java.util.LinkedList;
import java.util.List;

Expand Down Expand Up @@ -47,7 +49,7 @@ public class SeqColExtendedDataEntity {
private SeqColEntity.NamingConvention namingConvention;

public enum AttributeType {
names, sequences, md5DigestsOfSequences, lengths
names, sequences, md5DigestsOfSequences, lengths, sortedNameLengthPairs
}

public SeqColExtendedDataEntity setAttributeType(AttributeType attributeType) {
Expand Down Expand Up @@ -110,7 +112,8 @@ public static SeqColExtendedDataEntity constructSeqColLengthsObject(AssemblyEnti

/**
* Return the seqCol sequences array object*/
public static SeqColExtendedDataEntity constructSeqColSequencesObject(AssemblySequenceEntity assemblySequenceEntity) {
public static SeqColExtendedDataEntity constructSeqColSequencesObject(
AssemblySequenceEntity assemblySequenceEntity) {
SeqColExtendedDataEntity seqColSequencesObject = new SeqColExtendedDataEntity().setAttributeType(
SeqColExtendedDataEntity.AttributeType.sequences);
JSONExtData seqColSequencesArray = new JSONExtData();
Expand All @@ -128,7 +131,8 @@ public static SeqColExtendedDataEntity constructSeqColSequencesObject(AssemblySe

/**
* Return the seqCol md5-sequences array object (MD5 digests of the sequences)*/
public static SeqColExtendedDataEntity constructSeqColSequencesMd5Object(AssemblySequenceEntity assemblySequenceEntity) {
public static SeqColExtendedDataEntity constructSeqColSequencesMd5Object(
AssemblySequenceEntity assemblySequenceEntity) {
SeqColExtendedDataEntity seqColSequencesObject = new SeqColExtendedDataEntity().setAttributeType(
AttributeType.md5DigestsOfSequences);
JSONExtData seqColSequencesArray = new JSONExtData();
Expand All @@ -144,6 +148,63 @@ public static SeqColExtendedDataEntity constructSeqColSequencesMd5Object(Assembl
return seqColSequencesObject;
}

/**
 * Build the "sorted-name-length-pairs" extended data entity from a names entity and a lengths entity.
 *
 * <p>The names and lengths are paired by position, each pair is hashed, and the sorted list of
 * those hashes becomes the entity's extended data. The entity digest is the checksum of that
 * extended data's string form.
 *
 * @param extendedNames   extended data entity holding the sequence names
 * @param extendedLengths extended data entity holding the sequence lengths
 * @return the populated entity, or {@code null} when the two inputs do not hold the same number
 *         of elements
 */
public static SeqColExtendedDataEntity constructSeqColSortedNameLengthPairs(
        SeqColExtendedDataEntity extendedNames, SeqColExtendedDataEntity extendedLengths) {
    int namesCount = extendedNames.getExtendedSeqColData().getObject().size();
    int lengthsCount = extendedLengths.getExtendedSeqColData().getObject().size();
    if (namesCount != lengthsCount) {
        // Names and lengths cannot be paired one-to-one
        return null;
    }

    // Pair names with lengths, then hash and sort the pairs
    List<String> sortedPairDigests = constructSortedNameLengthPairs(
            constructNameLengthPairList(extendedNames, extendedLengths));

    JSONExtData pairsData = new JSONExtData();
    pairsData.setObject(sortedPairDigests);

    SeqColExtendedDataEntity pairsEntity = new SeqColExtendedDataEntity()
            .setAttributeType(AttributeType.sortedNameLengthPairs);
    pairsEntity.setExtendedSeqColData(pairsData);
    pairsEntity.setDigest(new SHA512ChecksumCalculator().calculateChecksum(pairsData.toString()));
    return pairsEntity;
}

/**
 * Pair every name with the length found at the same position.
 *
 * @param extendedNames   extended data entity holding the sequence names
 * @param extendedLengths extended data entity holding the sequence lengths
 * @return one {@link NameLengthPairEntity} per name, in the order of the names list
 */
private static List<NameLengthPairEntity> constructNameLengthPairList(
        SeqColExtendedDataEntity extendedNames, SeqColExtendedDataEntity extendedLengths) {
    List<NameLengthPairEntity> pairs = new ArrayList<>();
    int position = 0;
    for (String name : extendedNames.getExtendedSeqColData().getObject()) {
        // The lengths list is looked up lazily, once per name, by matching index
        String length = extendedLengths.getExtendedSeqColData().getObject().get(position);
        pairs.add(new NameLengthPairEntity(name, length));
        position++;
    }
    return pairs;
}

/**
 * Return the sorted-name-length-pair list for the given list of name-length pairs.
 *
 * <p>Each pair is serialised with {@code toString()} and hashed; the resulting digests are then
 * sorted in their natural (case-sensitive, code-unit) order.
 *
 * <p>A case-insensitive comparison must not be used here: digests that differ only in letter case
 * would compare as equal, so their final position would depend on the input order, and
 * lower-casing with the default locale would make the ordering machine-dependent.
 *
 * @param nameLengthPairList the name-length pairs to hash
 * @return a new list with one digest per input pair, sorted in natural order
 */
public static List<String> constructSortedNameLengthPairs(List<NameLengthPairEntity> nameLengthPairList) {
    SHA512ChecksumCalculator sha512ChecksumCalculator = new SHA512ChecksumCalculator();
    List<String> sortedNameLengthPairs = new ArrayList<>(nameLengthPairList.size());
    for (NameLengthPairEntity entity : nameLengthPairList) {
        sortedNameLengthPairs.add(sha512ChecksumCalculator.calculateChecksum(entity.toString()));
    }
    // Natural String order: total, deterministic and independent of the default locale
    sortedNameLengthPairs.sort(Comparator.naturalOrder());
    return sortedNameLengthPairs;
}

/**
* Return the list of extended data entities that are the same across multiple seqCol objects under
* the same assembly accession. These entities are "sequences", "md5Sequences" and "lengths". */
Expand All @@ -159,7 +220,8 @@ public static List<SeqColExtendedDataEntity> constructSameValueExtendedSeqColDat
/**
* Return a list of seqCol sequences' names with all possible naming convention that can be extracted
* from the given assemblyEntity*/
public static List<SeqColExtendedDataEntity> constructAllPossibleExtendedNamesSeqColData(AssemblyEntity assemblyEntity) {
public static List<SeqColExtendedDataEntity> constructAllPossibleExtendedNamesSeqColData(
AssemblyEntity assemblyEntity) {
List<SeqColEntity.NamingConvention> existingNamingConventions = new ArrayList<>();
if (assemblyEntity.getChromosomes().get(0).getEnaSequenceName() != null) {
existingNamingConventions.add(SeqColEntity.NamingConvention.ENA);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,8 @@ public class SeqColLevelTwoEntity extends SeqColEntity{
private List<String> lengths;
@JsonProperty("md5-sequences")
private List<String> md5DigestsOfSequences;
@JsonProperty("sorted-name-length-pairs")
private List<String> sortedNameLengthPairs;

public SeqColLevelTwoEntity setDigest(String digest) {
this.digest = digest;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
package uk.ac.ebi.eva.evaseqcol.model;

import lombok.AllArgsConstructor;
import lombok.Data;
import lombok.NoArgsConstructor;

/**
 * A sequence name together with its length, both kept as strings.
 *
 * <p>The {@link #toString()} form of this class is hashed when building the
 * sorted-name-length-pairs attribute, so any change to it changes the resulting digests.
 */
@Data
@NoArgsConstructor
public class NameLengthPairEntity {
    private String name;
    private String length;

    /**
     * @param name   the sequence name
     * @param length the sequence length, as a string
     */
    public NameLengthPairEntity(String name, String length) {
        this.name = name;
        this.length = length;
    }

    /**
     * Serialise the pair as a JSON object with the members {@code name} and {@code length}.
     *
     * <p>No comma is emitted after the last member: a trailing comma is not valid JSON.
     * Values are inserted verbatim, without JSON escaping.
     */
    // NOTE(review): key order, whitespace and the quoted length are kept as they were; confirm
    // against the seqCol specification whether a canonical form (sorted keys, no whitespace,
    // numeric length) is expected here.
    @Override
    public String toString() {
        return "{" +
                " \"name\":\"" + name + "\"," +
                " \"length\":\"" + length + "\"" +
                "}";
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -92,14 +92,22 @@ public Optional<SeqColExtendedDataEntity> getExtendedAttributeByDigest(String di
}

/**
* Return the 3 extended data objects (names, lengths, sequences and sequencesMD5) of the given naming convention*/
public List<SeqColExtendedDataEntity> constructExtendedSeqColDataList(AssemblyEntity assemblyEntity, AssemblySequenceEntity assemblySequenceEntity,
SeqColEntity.NamingConvention convention) throws IOException {
* Return the 5 seqCol extended data objects (names, lengths, sequences, sequencesMD5 and sorted-name-length-pair)
* of the given assembly and naming convention*/
public List<SeqColExtendedDataEntity> constructExtendedSeqColDataList(
AssemblyEntity assemblyEntity, AssemblySequenceEntity assemblySequenceEntity,
SeqColEntity.NamingConvention convention) {
SeqColExtendedDataEntity extendedLengthsEntity = SeqColExtendedDataEntity
.constructSeqColLengthsObject(assemblyEntity);
SeqColExtendedDataEntity extendedNamesEntity = SeqColExtendedDataEntity
.constructSeqColNamesObjectByNamingConvention(assemblyEntity, convention);

return Arrays.asList(
SeqColExtendedDataEntity.constructSeqColSequencesObject(assemblySequenceEntity),
SeqColExtendedDataEntity.constructSeqColSequencesMd5Object(assemblySequenceEntity),
SeqColExtendedDataEntity.constructSeqColNamesObjectByNamingConvention(assemblyEntity, convention),
SeqColExtendedDataEntity.constructSeqColLengthsObject(assemblyEntity)
extendedNamesEntity,
extendedLengthsEntity,
SeqColExtendedDataEntity.constructSeqColSortedNameLengthPairs(extendedNamesEntity, extendedLengthsEntity)
);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,9 @@ public SeqColLevelOneEntity constructSeqColLevelOne(List<SeqColExtendedDataEntit
case md5DigestsOfSequences:
jsonLevelOne.setMd5DigestsOfSequences(dataEntity.getDigest());
break;
case sortedNameLengthPairs:
jsonLevelOne.setSortedNameLengthPairs(dataEntity.getDigest());
break;
}
}
levelOneEntity.setSeqColLevel1Object(jsonLevelOne);
Expand All @@ -97,6 +100,7 @@ public SeqColLevelOneEntity constructSeqColLevelOne(
JSONExtData lengthsExtData = new JSONExtData(levelTwoEntity.getLengths());
JSONExtData namesExtData = new JSONExtData(levelTwoEntity.getNames());
JSONExtData md5SequencesExtData = new JSONExtData(levelTwoEntity.getMd5DigestsOfSequences());
JSONExtData sortedNameLengthPairsData = new JSONExtData(levelTwoEntity.getSortedNameLengthPairs());

// Sequences
SeqColExtendedDataEntity sequencesExtEntity = new SeqColExtendedDataEntity();
Expand All @@ -118,12 +122,18 @@ public SeqColLevelOneEntity constructSeqColLevelOne(
namesExtEntity.setAttributeType(SeqColExtendedDataEntity.AttributeType.names);
namesExtEntity.setExtendedSeqColData(namesExtData);
namesExtEntity.setDigest(sha512Calculator.calculateChecksum(namesExtData.toString()));
//sorted-name-length-pairs
SeqColExtendedDataEntity sortedNameLengthPairsExtEntity = new SeqColExtendedDataEntity();
sortedNameLengthPairsExtEntity.setAttributeType(SeqColExtendedDataEntity.AttributeType.sortedNameLengthPairs);
sortedNameLengthPairsExtEntity.setExtendedSeqColData(sortedNameLengthPairsData);
sortedNameLengthPairsExtEntity.setDigest(sha512Calculator.calculateChecksum(sortedNameLengthPairsData.toString()));

List<SeqColExtendedDataEntity> extendedDataEntities = Arrays.asList(
sequencesExtEntity,
md5SequencesExtEntity,
lengthsExtEntity,
namesExtEntity
namesExtEntity,
sortedNameLengthPairsExtEntity
);
return constructSeqColLevelOne(extendedDataEntities, convention);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,9 @@ public SeqColLevelTwoEntity constructSeqColL2(String level0Digest, List<SeqColEx
case md5DigestsOfSequences:
levelTwoEntity.setMd5DigestsOfSequences(extendedData.getExtendedSeqColData().getObject());
break;
case sortedNameLengthPairs:
levelTwoEntity.setSortedNameLengthPairs(extendedData.getExtendedSeqColData().getObject());
break;
}
}
return levelTwoEntity;
Expand Down
20 changes: 20 additions & 0 deletions src/main/java/uk/ac/ebi/eva/evaseqcol/service/SeqColService.java
Original file line number Diff line number Diff line change
Expand Up @@ -82,10 +82,16 @@ public Optional<? extends SeqColEntity> getSeqColByDigestAndLevel(String digest,
String namesDigest = seqColLevelOne.get().getSeqColLevel1Object().getNames();
JSONExtData extendedNames = extendedDataService.getSeqColExtendedDataEntityByDigest(namesDigest).get().getExtendedSeqColData();

// Retrieving sortedNameLengthPairs
String sortedNameLengthPairsDigest = seqColLevelOne.get().getSeqColLevel1Object().getSortedNameLengthPairs();
JSONExtData extendedSortedNameLengthPairs = extendedDataService.
getSeqColExtendedDataEntityByDigest(sortedNameLengthPairsDigest).get().getExtendedSeqColData();

levelTwoEntity.setSequences(extendedSequences.getObject());
levelTwoEntity.setMd5DigestsOfSequences(extendedMd5Sequnces.getObject());
levelTwoEntity.setLengths(extendedLengths.getObject());
levelTwoEntity.setNames(extendedNames.getObject());
levelTwoEntity.setSortedNameLengthPairs(extendedSortedNameLengthPairs.getObject());

return Optional.of(levelTwoEntity);
} else {
Expand Down Expand Up @@ -128,7 +134,11 @@ public List<String> fetchAndInsertAllSeqColByAssemblyAccession(
List<SeqColExtendedDataEntity> sameValueAttributeList = seqColDataMap.get().get("sameValueAttributes");
for (SeqColExtendedDataEntity extendedNamesEntity: possibleSequencesNamesList) {
List<SeqColExtendedDataEntity> seqColExtendedDataEntities = new ArrayList<>(sameValueAttributeList);
SeqColExtendedDataEntity extendedLengthsEntity = retrieveExtendedLengthEntity(seqColExtendedDataEntities);
SeqColExtendedDataEntity seqColSortedNameLengthPairEntity = SeqColExtendedDataEntity.
constructSeqColSortedNameLengthPairs(extendedNamesEntity, extendedLengthsEntity);
seqColExtendedDataEntities.add(extendedNamesEntity);
seqColExtendedDataEntities.add(seqColSortedNameLengthPairEntity);
SeqColLevelOneEntity levelOneEntity = levelOneService.constructSeqColLevelOne(seqColExtendedDataEntities, extendedNamesEntity.getNamingConvention());
Optional<String> seqColDigest = insertSeqColL1AndL2(levelOneEntity, seqColExtendedDataEntities);
if (seqColDigest.isPresent()) {
Expand All @@ -142,6 +152,16 @@ public List<String> fetchAndInsertAllSeqColByAssemblyAccession(
return insertedSeqColDigests;
}

/**
 * Find the extended data entity whose attribute type is {@code lengths}.
 *
 * @param extendedDataEntities the entities to search
 * @return the first entity of type {@code lengths}, or {@code null} when the list contains none
 */
public SeqColExtendedDataEntity retrieveExtendedLengthEntity(List<SeqColExtendedDataEntity> extendedDataEntities) {
    SeqColExtendedDataEntity lengthsEntity = null;
    for (SeqColExtendedDataEntity candidate : extendedDataEntities) {
        if (SeqColExtendedDataEntity.AttributeType.lengths == candidate.getAttributeType()) {
            lengthsEntity = candidate;
            break;
        }
    }
    return lengthsEntity;
}
@Transactional
/**
* Insert the given Level 1 seqCol entity and its corresponding extended level 2 data (names, lengths, sequences, ...)
Expand Down
2 changes: 2 additions & 0 deletions src/main/java/uk/ac/ebi/eva/evaseqcol/utils/JSONLevelOne.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ public class JSONLevelOne implements Serializable {
private String md5DigestsOfSequences;
private String names;
private String lengths;
@JsonProperty("sorted-name-length-pairs")
private String sortedNameLengthPairs;

public JSONLevelOne setSequences(String sequences) {
this.sequences = sequences;
Expand Down

0 comments on commit 6af09f5

Please sign in to comment.