From 3c0cca1e039110da887eddcf312bf175dd4a04a3 Mon Sep 17 00:00:00 2001 From: waterflow80 Date: Sun, 13 Aug 2023 23:54:15 +0100 Subject: [PATCH 1/5] added the sorted-name-length-pair attribute to the model --- .../entities/SeqColExtendedDataEntity.java | 70 +++++++++++++++++-- .../entities/SeqColLevelTwoEntity.java | 2 + .../evaseqcol/model/NameLengthPairEntity.java | 25 +++++++ .../service/SeqColLevelOneService.java | 14 +++- .../eva/evaseqcol/service/SeqColService.java | 20 ++++++ .../ebi/eva/evaseqcol/utils/JSONLevelOne.java | 2 + 6 files changed, 127 insertions(+), 6 deletions(-) create mode 100644 src/main/java/uk/ac/ebi/eva/evaseqcol/model/NameLengthPairEntity.java diff --git a/src/main/java/uk/ac/ebi/eva/evaseqcol/entities/SeqColExtendedDataEntity.java b/src/main/java/uk/ac/ebi/eva/evaseqcol/entities/SeqColExtendedDataEntity.java index bee2431..46652df 100644 --- a/src/main/java/uk/ac/ebi/eva/evaseqcol/entities/SeqColExtendedDataEntity.java +++ b/src/main/java/uk/ac/ebi/eva/evaseqcol/entities/SeqColExtendedDataEntity.java @@ -6,6 +6,7 @@ import org.hibernate.annotations.TypeDef; import org.hibernate.annotations.TypeDefs; +import uk.ac.ebi.eva.evaseqcol.model.NameLengthPairEntity; import uk.ac.ebi.eva.evaseqcol.refget.SHA512ChecksumCalculator; import uk.ac.ebi.eva.evaseqcol.utils.JSONExtData; @@ -18,6 +19,7 @@ import javax.persistence.Transient; import java.util.ArrayList; import java.util.Arrays; +import java.util.Comparator; import java.util.LinkedList; import java.util.List; @@ -47,7 +49,7 @@ public class SeqColExtendedDataEntity { private SeqColEntity.NamingConvention namingConvention; public enum AttributeType { - names, sequences, md5DigestsOfSequences, lengths + names, sequences, md5DigestsOfSequences, lengths, sortedNameLengthPairs } public SeqColExtendedDataEntity setAttributeType(AttributeType attributeType) { @@ -110,7 +112,8 @@ public static SeqColExtendedDataEntity constructSeqColLengthsObject(AssemblyEnti /** * Return the seqCol sequences array object*/ - public static SeqColExtendedDataEntity constructSeqColSequencesObject(AssemblySequenceEntity assemblySequenceEntity) { + public static SeqColExtendedDataEntity constructSeqColSequencesObject( + AssemblySequenceEntity assemblySequenceEntity) { SeqColExtendedDataEntity seqColSequencesObject = new SeqColExtendedDataEntity().setAttributeType( SeqColExtendedDataEntity.AttributeType.sequences); JSONExtData seqColSequencesArray = new JSONExtData(); @@ -128,7 +131,8 @@ public static SeqColExtendedDataEntity constructSeqColSequencesObject(AssemblySe /** * Return the seqCol sequences array object*/ - public static SeqColExtendedDataEntity constructSeqColSequencesMd5Object(AssemblySequenceEntity assemblySequenceEntity) { + public static SeqColExtendedDataEntity constructSeqColSequencesMd5Object( + AssemblySequenceEntity assemblySequenceEntity) { SeqColExtendedDataEntity seqColSequencesObject = new SeqColExtendedDataEntity().setAttributeType( AttributeType.md5DigestsOfSequences); JSONExtData seqColSequencesArray = new JSONExtData(); @@ -144,6 +148,63 @@ public static SeqColExtendedDataEntity constructSeqColSequencesMd5Object(Assembl return seqColSequencesObject; } + /** + * Return the seqCol sorted-name-length-pairs extended object*/ + public static SeqColExtendedDataEntity constructSeqColSortedNameLengthPairs( + SeqColExtendedDataEntity extendedNames, SeqColExtendedDataEntity extendedLengths) { + if (extendedNames.getExtendedSeqColData().getObject().size() != extendedLengths.getExtendedSeqColData().getObject().size()) { + return null; // Names and Lengths entities are not compatible + } + SeqColExtendedDataEntity SeqColSortedNameLengthPairsObject = new SeqColExtendedDataEntity().setAttributeType( + AttributeType.sortedNameLengthPairs); + JSONExtData seqColSortedNameLengthPairsArray = new JSONExtData(); + + // Get the plain name-length pairs + List nameLengthPairList = constructNameLengthPairList(extendedNames, extendedLengths); + // Get the sorted list + List sortedNameLengthPairsList = constructSortedNameLengthPairs(nameLengthPairList); + + SHA512ChecksumCalculator sha512ChecksumCalculator = new SHA512ChecksumCalculator(); + seqColSortedNameLengthPairsArray.setObject(sortedNameLengthPairsList); + SeqColSortedNameLengthPairsObject.setExtendedSeqColData(seqColSortedNameLengthPairsArray); + SeqColSortedNameLengthPairsObject.setDigest(sha512ChecksumCalculator.calculateChecksum( + seqColSortedNameLengthPairsArray.toString())); + return SeqColSortedNameLengthPairsObject; + } + + /** + * Retrieve and construct the list of name-length pairs*/ + private static List constructNameLengthPairList( + SeqColExtendedDataEntity extendedNames, SeqColExtendedDataEntity extendedLengths) { + List nameLengthPairList = new ArrayList<>(); + for (int i=0; i constructSortedNameLengthPairs(List nameLengthPairList) { + SHA512ChecksumCalculator sha512ChecksumCalculator = new SHA512ChecksumCalculator(); + List sortedNameLengthPairs = new ArrayList<>(); + for (NameLengthPairEntity entity: nameLengthPairList) { + String nameLengthHash = sha512ChecksumCalculator.calculateChecksum(entity.toString()); + sortedNameLengthPairs.add(nameLengthHash); + } + // Sorting the name-length-pair list according to the elements' natural order (alphanumerically) + Comparator nameLengthComparator = new Comparator() { + @Override + public int compare(String o1, String o2) { + return o1.toLowerCase().compareTo(o2.toLowerCase()); + } + }; + sortedNameLengthPairs.sort(nameLengthComparator); + return sortedNameLengthPairs; + } + /** * Return the list of extended data entities that are the same across multiple seqCol objects under * the same assembly accession. These entities are "sequences", "md5Sequences" and "lengths". */ @@ -159,7 +220,8 @@ public static List constructSameValueExtendedSeqColDat /** * Return a list of seqCol sequences' names with all possible naming convention that can be extracted * from the given assemblyEntity*/ - public static List constructAllPossibleExtendedNamesSeqColData(AssemblyEntity assemblyEntity) { + public static List constructAllPossibleExtendedNamesSeqColData( + AssemblyEntity assemblyEntity) { List existingNamingConventions = new ArrayList<>(); if (assemblyEntity.getChromosomes().get(0).getEnaSequenceName() != null) { existingNamingConventions.add(SeqColEntity.NamingConvention.ENA); diff --git a/src/main/java/uk/ac/ebi/eva/evaseqcol/entities/SeqColLevelTwoEntity.java b/src/main/java/uk/ac/ebi/eva/evaseqcol/entities/SeqColLevelTwoEntity.java index bfa15ea..318638a 100644 --- a/src/main/java/uk/ac/ebi/eva/evaseqcol/entities/SeqColLevelTwoEntity.java +++ b/src/main/java/uk/ac/ebi/eva/evaseqcol/entities/SeqColLevelTwoEntity.java @@ -17,6 +17,8 @@ public class SeqColLevelTwoEntity extends SeqColEntity{ private List lengths; @JsonProperty("md5-sequences") private List md5DigestsOfSequences; + @JsonProperty("sorted-name-length-pairs") + private List sortedNameLengthPairs; public SeqColLevelTwoEntity setDigest(String digest) { this.digest = digest; diff --git a/src/main/java/uk/ac/ebi/eva/evaseqcol/model/NameLengthPairEntity.java b/src/main/java/uk/ac/ebi/eva/evaseqcol/model/NameLengthPairEntity.java new file mode 100644 index 0000000..51ca942 --- /dev/null +++ b/src/main/java/uk/ac/ebi/eva/evaseqcol/model/NameLengthPairEntity.java @@ -0,0 +1,25 @@ +package uk.ac.ebi.eva.evaseqcol.model; + +import lombok.AllArgsConstructor; +import lombok.Data; +import lombok.NoArgsConstructor; + +@Data +@NoArgsConstructor +public class NameLengthPairEntity { + private String name; + private String length; + + public NameLengthPairEntity(String name, String length) { + this.name = name; + this.length = length; + } + + @Override + public String toString() { + return "{\n" + + " \"name\": \""+ name +"\",\n" + + " \"length\": \""+ length +"\",\n" + + "}"; + } +} diff --git a/src/main/java/uk/ac/ebi/eva/evaseqcol/service/SeqColLevelOneService.java b/src/main/java/uk/ac/ebi/eva/evaseqcol/service/SeqColLevelOneService.java index ee9c1e4..3ad5fbc 100644 --- a/src/main/java/uk/ac/ebi/eva/evaseqcol/service/SeqColLevelOneService.java +++ b/src/main/java/uk/ac/ebi/eva/evaseqcol/service/SeqColLevelOneService.java @@ -79,6 +79,9 @@ public SeqColLevelOneEntity constructSeqColLevelOne(List extendedDataEntities = Arrays.asList( sequencesExtEntity, md5SequencesExtEntity, lengthsExtEntity, - namesExtEntity + namesExtEntity, + sortedNameLengthPairsExtEntity ); return constructSeqColLevelOne(extendedDataEntities, convention); } diff --git a/src/main/java/uk/ac/ebi/eva/evaseqcol/service/SeqColService.java b/src/main/java/uk/ac/ebi/eva/evaseqcol/service/SeqColService.java index 999eb14..602adf1 100644 --- a/src/main/java/uk/ac/ebi/eva/evaseqcol/service/SeqColService.java +++ b/src/main/java/uk/ac/ebi/eva/evaseqcol/service/SeqColService.java @@ -82,10 +82,16 @@ public Optional getSeqColByDigestAndLevel(String digest, String namesDigest = seqColLevelOne.get().getSeqColLevel1Object().getNames(); JSONExtData extendedNames = extendedDataService.getSeqColExtendedDataEntityByDigest(namesDigest).get().getExtendedSeqColData(); + // Retrieving sortedNameLengthPairs + String sortedNameLengthPairsDigest = seqColLevelOne.get().getSeqColLevel1Object().getSortedNameLengthPairs(); + JSONExtData extendedSortedNameLengthPairs = extendedDataService. + getSeqColExtendedDataEntityByDigest(sortedNameLengthPairsDigest).get().getExtendedSeqColData(); + levelTwoEntity.setSequences(extendedSequences.getObject()); levelTwoEntity.setMd5DigestsOfSequences(extendedMd5Sequnces.getObject()); levelTwoEntity.setLengths(extendedLengths.getObject()); levelTwoEntity.setNames(extendedNames.getObject()); + levelTwoEntity.setSortedNameLengthPairs(extendedSortedNameLengthPairs.getObject()); return Optional.of(levelTwoEntity); } else { @@ -128,7 +134,11 @@ public List fetchAndInsertAllSeqColByAssemblyAccession( List sameValueAttributeList = seqColDataMap.get().get("sameValueAttributes"); for (SeqColExtendedDataEntity extendedNamesEntity: possibleSequencesNamesList) { List seqColExtendedDataEntities = new ArrayList<>(sameValueAttributeList); + SeqColExtendedDataEntity extendedLengthsEntity = retrieveExtendedLengthEntity(seqColExtendedDataEntities); + SeqColExtendedDataEntity seqColSortedNameLengthPairEntity = SeqColExtendedDataEntity. + constructSeqColSortedNameLengthPairs(extendedNamesEntity, extendedLengthsEntity); seqColExtendedDataEntities.add(extendedNamesEntity); + seqColExtendedDataEntities.add(seqColSortedNameLengthPairEntity); SeqColLevelOneEntity levelOneEntity = levelOneService.constructSeqColLevelOne(seqColExtendedDataEntities, extendedNamesEntity.getNamingConvention()); Optional seqColDigest = insertSeqColL1AndL2(levelOneEntity, seqColExtendedDataEntities); if (seqColDigest.isPresent()) { @@ -142,6 +152,16 @@ public List fetchAndInsertAllSeqColByAssemblyAccession( return insertedSeqColDigests; } + /** + * Return the extended data entity that corresponds to the seqCol lengths attribute*/ + public SeqColExtendedDataEntity retrieveExtendedLengthEntity(List extendedDataEntities) { + for (SeqColExtendedDataEntity entity: extendedDataEntities) { + if (entity.getAttributeType() == SeqColExtendedDataEntity.AttributeType.lengths) { + return entity; + } + } + return null; + } @Transactional /** * Insert the given Level 1 seqCol entity and its corresponding extended level 2 data (names, lengths, sequences, ...) diff --git a/src/main/java/uk/ac/ebi/eva/evaseqcol/utils/JSONLevelOne.java b/src/main/java/uk/ac/ebi/eva/evaseqcol/utils/JSONLevelOne.java index 5c3a0b1..5b87b2d 100644 --- a/src/main/java/uk/ac/ebi/eva/evaseqcol/utils/JSONLevelOne.java +++ b/src/main/java/uk/ac/ebi/eva/evaseqcol/utils/JSONLevelOne.java @@ -16,6 +16,8 @@ public class JSONLevelOne implements Serializable { private String md5DigestsOfSequences; private String names; private String lengths; + @JsonProperty("sorted-name-length-pairs") + private String sortedNameLengthPairs; public JSONLevelOne setSequences(String sequences) { this.sequences = sequences; From 0d7eace352264678cb1dede16294bd37604605cc Mon Sep 17 00:00:00 2001 From: waterflow80 Date: Mon, 14 Aug 2023 00:15:34 +0100 Subject: [PATCH 2/5] test fix --- .../service/SeqColExtendedDataService.java | 18 +++++++++++++----- .../service/SeqColLevelTwoService.java | 3 +++ 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/src/main/java/uk/ac/ebi/eva/evaseqcol/service/SeqColExtendedDataService.java b/src/main/java/uk/ac/ebi/eva/evaseqcol/service/SeqColExtendedDataService.java index 56058c8..27931cb 100644 --- a/src/main/java/uk/ac/ebi/eva/evaseqcol/service/SeqColExtendedDataService.java +++ b/src/main/java/uk/ac/ebi/eva/evaseqcol/service/SeqColExtendedDataService.java @@ -92,14 +92,22 @@ public Optional getExtendedAttributeByDigest(String di } /** - * Return the 3 extended data objects (names, lengths, sequences and sequencesMD5) of the given naming convention*/ - public List constructExtendedSeqColDataList(AssemblyEntity assemblyEntity, AssemblySequenceEntity assemblySequenceEntity, - SeqColEntity.NamingConvention convention) throws IOException { + * Return the 5 seqCol extended data objects (names, lengths, sequences, sequencesMD5 and sorted-name-length-pair) + * of the given assembly and naming convention*/ + public List constructExtendedSeqColDataList( + AssemblyEntity assemblyEntity, AssemblySequenceEntity assemblySequenceEntity, + SeqColEntity.NamingConvention convention) { + SeqColExtendedDataEntity extendedLengthsEntity = SeqColExtendedDataEntity + .constructSeqColLengthsObject(assemblyEntity); + SeqColExtendedDataEntity extendedNamesEntity = SeqColExtendedDataEntity + .constructSeqColNamesObjectByNamingConvention(assemblyEntity, convention); + return Arrays.asList( SeqColExtendedDataEntity.constructSeqColSequencesObject(assemblySequenceEntity), SeqColExtendedDataEntity.constructSeqColSequencesMd5Object(assemblySequenceEntity), - SeqColExtendedDataEntity.constructSeqColNamesObjectByNamingConvention(assemblyEntity, convention), - SeqColExtendedDataEntity.constructSeqColLengthsObject(assemblyEntity) + extendedNamesEntity, + extendedLengthsEntity, + SeqColExtendedDataEntity.constructSeqColSortedNameLengthPairs(extendedNamesEntity, extendedLengthsEntity) ); } diff --git a/src/main/java/uk/ac/ebi/eva/evaseqcol/service/SeqColLevelTwoService.java b/src/main/java/uk/ac/ebi/eva/evaseqcol/service/SeqColLevelTwoService.java index 5982b12..55f1254 100644 --- a/src/main/java/uk/ac/ebi/eva/evaseqcol/service/SeqColLevelTwoService.java +++ b/src/main/java/uk/ac/ebi/eva/evaseqcol/service/SeqColLevelTwoService.java @@ -106,6 +106,9 @@ public SeqColLevelTwoEntity constructSeqColL2(String level0Digest, List Date: Tue, 15 Aug 2023 21:03:53 +0100 Subject: [PATCH 3/5] upadte toString method --- .../uk/ac/ebi/eva/evaseqcol/model/NameLengthPairEntity.java | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/main/java/uk/ac/ebi/eva/evaseqcol/model/NameLengthPairEntity.java b/src/main/java/uk/ac/ebi/eva/evaseqcol/model/NameLengthPairEntity.java index 51ca942..ab6904a 100644 --- a/src/main/java/uk/ac/ebi/eva/evaseqcol/model/NameLengthPairEntity.java +++ b/src/main/java/uk/ac/ebi/eva/evaseqcol/model/NameLengthPairEntity.java @@ -17,9 +17,9 @@ public NameLengthPairEntity(String name, String length) { @Override public String toString() { - return "{\n" + - " \"name\": \""+ name +"\",\n" + - " \"length\": \""+ length +"\",\n" + + return "{" + + " \"name\": \""+ name +"\"," + + " \"length\": \""+ length +"\"," + "}"; } } From 03f323caa24c8e7549425b7706203818b345fad6 Mon Sep 17 00:00:00 2001 From: waterflow80 Date: Thu, 17 Aug 2023 10:56:54 +0100 Subject: [PATCH 4/5] removed unnecessary tab space from the toString method --- .../uk/ac/ebi/eva/evaseqcol/model/NameLengthPairEntity.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/uk/ac/ebi/eva/evaseqcol/model/NameLengthPairEntity.java b/src/main/java/uk/ac/ebi/eva/evaseqcol/model/NameLengthPairEntity.java index ab6904a..1fec5da 100644 --- a/src/main/java/uk/ac/ebi/eva/evaseqcol/model/NameLengthPairEntity.java +++ b/src/main/java/uk/ac/ebi/eva/evaseqcol/model/NameLengthPairEntity.java @@ -18,8 +18,8 @@ public NameLengthPairEntity(String name, String length) { @Override public String toString() { return "{" + - " \"name\": \""+ name +"\"," + - " \"length\": \""+ length +"\"," + + " \"name\":\""+ name +"\"," + + " \"length\":\""+ length +"\"," + "}"; } } From 1438fb2bcf4cb4b589e7d1a3786ee83a78859d69 Mon Sep 17 00:00:00 2001 From: waterflow80 Date: Thu, 17 Aug 2023 11:08:55 +0100 Subject: [PATCH 5/5] change after rebase --- .../uk/ac/ebi/eva/evaseqcol/service/SeqColLevelOneService.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/uk/ac/ebi/eva/evaseqcol/service/SeqColLevelOneService.java b/src/main/java/uk/ac/ebi/eva/evaseqcol/service/SeqColLevelOneService.java index 3ad5fbc..c0c5d8d 100644 --- a/src/main/java/uk/ac/ebi/eva/evaseqcol/service/SeqColLevelOneService.java +++ b/src/main/java/uk/ac/ebi/eva/evaseqcol/service/SeqColLevelOneService.java @@ -99,7 +99,7 @@ public SeqColLevelOneEntity constructSeqColLevelOne( JSONExtData sequencesExtData = new JSONExtData(levelTwoEntity.getSequences()); JSONExtData lengthsExtData = new JSONExtData(levelTwoEntity.getLengths()); JSONExtData namesExtData = new JSONExtData(levelTwoEntity.getNames()); - JSONExtData md5SequencesExtData = new JSONExtData(levelTwoEntity.getMd5Sequences()); + JSONExtData md5SequencesExtData = new JSONExtData(levelTwoEntity.getMd5DigestsOfSequences()); JSONExtData sortedNameLengthPairsData = new JSONExtData(levelTwoEntity.getSortedNameLengthPairs()); // Sequences