Skip to content
This repository has been archived by the owner on Jun 20, 2024. It is now read-only.

Feature/biohackathon #327

Closed
wants to merge 13 commits into from
3 changes: 3 additions & 0 deletions api/api.go
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,10 @@ func Setup() *http.Server {
router.GET("/files/:fileid", SelectedMiddleware(), sda.Download)
router.GET("/s3/*path", SelectedMiddleware(), s3.Download)
router.HEAD("/s3/*path", SelectedMiddleware(), s3.Download)
router.GET("/s3-encrypted/*path", SelectedMiddleware(), s3.Download)
router.GET("/health", healthResponse)
router.GET("/header/*path", SelectedMiddleware(), s3.Download)
router.GET("/c4gheadsize/*path", SelectedMiddleware(), s3.Download)

// Configure TLS settings
log.Info("(3/5) Configuring TLS")
Expand Down
60 changes: 52 additions & 8 deletions api/s3/s3.go
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,22 @@ func ListObjects(c *gin.Context) {
})
}

// getFileInfo looks up the database record for the file addressed by the
// "dataset" and "filename" request params (with ".c4gh" appended to the
// filename). On lookup failure it aborts the request itself — 404 when the
// error text is "sql: no rows in result set", 500 otherwise — and hands the
// error back so the caller only needs to return.
func getFileInfo(c *gin.Context) (fileInfo *database.FileInfo, err error) {
	storedName := c.Param("filename") + ".c4gh"

	fileInfo, err = database.GetDatasetFileInfo(c.Param("dataset"), storedName)
	if err == nil {
		return fileInfo, nil
	}

	// NOTE(review): the not-found case is detected by comparing the error
	// message; if the database layer ever wraps the error this falls through
	// to 500. errors.Is(err, sql.ErrNoRows) would be sturdier — confirm what
	// GetDatasetFileInfo returns before switching.
	status := http.StatusInternalServerError
	if err.Error() == "sql: no rows in result set" {
		status = http.StatusNotFound
	}
	c.AbortWithStatus(status)

	return fileInfo, err
}

// GetObject responds to an S3 GetObject request. This request returns S3
// objects. This is done by first fetching any file that matches the dataset +
// filename request from the database and then passing the fileID to the
Expand All @@ -196,15 +212,36 @@ func ListObjects(c *gin.Context) {
func GetObject(c *gin.Context) {
	log.Debugf("S3 GetObject request, context: %v", c.Params)

	// Look up the file; on failure getFileInfo has already aborted the
	// request with the appropriate status code, so just stop here.
	fileInfo, err := getFileInfo(c)
	if err != nil {
		return
	}

	// Set a param so that Download knows to add S3 headers
	c.Set("S3", true)

	// set the fileID so that download knows what file to download
	c.Params = append(c.Params, gin.Param{Key: "fileid", Value: fileInfo.FileID})

	// Pick the response type from the route prefix only. Matching a substring
	// of the whole URL would also trigger on dataset names, file names or
	// query strings that merely contain "header" or "c4gheadsize".
	requestPath := c.Request.URL.Path
	switch {
	case strings.HasPrefix(requestPath, "/header/"):
		c.Params = append(c.Params, gin.Param{Key: "type", Value: "header"})
	case strings.HasPrefix(requestPath, "/c4gheadsize/"):
		c.Params = append(c.Params, gin.Param{Key: "type", Value: "headersize"})
	}

	// Download the file
	sda.Download(c)
}

// GetEcnryptedObject responds to an S3 GetObject request for encrypted files.
// This request returns S3 objects. This is done by first fetching any file that matches the dataset +
// filename request from the database and then passing the fileID to the
// SDA Download function.
// https://docs.aws.amazon.com/AmazonS3/latest/API/API_GetObject.html
func GetEcnryptedObject(c *gin.Context) {
log.Debugf("S3 GetEncryptedObject request, context: %v", c.Params)

fileInfo, err := getFileInfo(c)
if err != nil {
return
}

Expand All @@ -214,6 +251,9 @@ func GetObject(c *gin.Context) {
// set the fileID so that download knows what file to download
c.Params = append(c.Params, gin.Param{Key: "fileid", Value: fileInfo.FileID})

// set the encrypted parameter so that download gets the encrypted file instead
c.Params = append(c.Params, gin.Param{Key: "type", Value: "encrypted"})

// Download the file
sda.Download(c)
}
Expand Down Expand Up @@ -294,7 +334,11 @@ func Download(c *gin.Context) {
ListBuckets(c)

case c.Param("filename") != "":
GetObject(c)
if strings.Contains(c.Request.URL.String(), "encrypted") {
GetEcnryptedObject(c)
} else {
GetObject(c)
}

default:
log.Warningf("Got unknown S3 request: %v", c.Request)
Expand Down
74 changes: 68 additions & 6 deletions api/sda/sda.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ import (
"strings"
"time"

"github.com/biogo/hts/bam"
"github.com/gin-gonic/gin"
"github.com/neicnordic/crypt4gh/model/headers"
"github.com/neicnordic/crypt4gh/streaming"
Expand Down Expand Up @@ -198,18 +199,79 @@ func Download(c *gin.Context) {
return
}

// Stitch file and prepare it for streaming
fileStream, err := stitchFile(fileDetails.Header, file, coordinates)
if err != nil {
log.Errorf("could not prepare file for streaming, %s", err)
c.String(http.StatusInternalServerError, "file stream error")
var fileStream io.Reader
switch c.Param("type") {
case "encrypted":
log.Print("Return encrypted file")
fileStream, err = stitchEncryptedFile(fileDetails.Header, file, coordinates)
if err != nil {
log.Errorf("could not prepare file for streaming, %s", err)
c.String(http.StatusInternalServerError, "file stream error")

return
return
}
c.Header("Content-Length", "")
default:
// Stitch file and prepare it for streaming
fileStream, err = stitchFile(fileDetails.Header, file, coordinates)
if err != nil {
log.Errorf("could not prepare file for streaming, %s", err)
c.String(http.StatusInternalServerError, "file stream error")

return
}
}
log.Debug("- - - - - - - test 1 - - - - - - - ")
if c.Param("type") == "header" {
bamReader, err := bam.NewReader(fileStream, 0)
if err != nil {
log.Fatalln(err)
}
defer bamReader.Close()
reader := bamReader
h := reader.Header()
var r io.Reader
buf := new(bytes.Buffer)
fmt.Fprint(buf, h)
r = buf
sendStream(c.Writer, r)

return

} else if c.Param("type") == "headersize" {
log.Debug("- - - - - - - test 2 - - - - - - - ")
head := fileDetails.Header
log.Debug("head: ", head)
//headlength := len(head)
headlength := bytes.NewReader(head)
log.Debug("headlength size: ", headlength.Size())
buf := new(bytes.Buffer)
len := strconv.Itoa(int(headlength.Size()))
fmt.Fprint(buf, len)
sendStream(c.Writer, buf)

return
}
log.Debug("- - - - - - - test 3 - - - - - - - ")
sendStream(c.Writer, fileStream)
}

// stitchEncryptedFile concatenates the header bytes and the file body, without
// transforming either, and returns the result as a streamable Reader.
// The coordinates argument keeps the signature aligned with stitchFile; it is
// not used here. The returned error is always nil.
var stitchEncryptedFile = func(header []byte, file io.ReadCloser, coordinates *headers.DataEditListHeaderPacket) (io.Reader, error) {
	log.Debugf("stitching header to file %s for streaming", file)

	// Header first, then body. The header reader must appear exactly once:
	// listing it a second time only "worked" because it was already drained
	// by the time the nested reader reached it.
	encryptedFile := io.MultiReader(bytes.NewReader(header), file)

	log.Debugf("file stream for %s constructed", file)

	return encryptedFile, nil
}

// stitchFile stitches the header and file body together for Crypt4GHReader
// and returns a streamable Reader
var stitchFile = func(header []byte, file io.ReadCloser, coordinates *headers.DataEditListHeaderPacket) (*streaming.Crypt4GHReader, error) {
Expand Down
98 changes: 98 additions & 0 deletions biohackathon/paper.bib
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
@misc{Tryggve1,
title={NeIC {T}ryggve 1},
url={https://neic.no/tryggve1/},
author={NeIC},
publisher={Nordic e-Infrastructure Collaboration},
year={2014-2017},
note={Accessed: 2023-03-29}
}

@misc{NEIC,
title={NeIC},
url={https://neic.no/},
author={NeIC},
publisher={Nordic e-Infrastructure Collaboration},
note={Accessed: 2023-03-29}
}

@misc{Tryggve2,
title={NeIC {T}ryggve 2},
url={https://neic.no/tryggve2/},
author={NeIC},
year={2017-2020},
publisher={Nordic e-Infrastructure Collaboration},
note={Accessed: 2023-03-29}
}

@misc{Heilsa,
title={NeIC {H}eilsa {T}ryggvedottir},
url={https://neic.no/heilsa/},
author={NeIC},
year={2021-2024},
publisher={Nordic e-Infrastructure Collaboration},
note={Accessed: 2023-03-29}
}

@article{EGA,
Title={The European Genome-phenome Archive in 2021},
Author={Freeberg, Mallory Ann and Fromont, Lauren A and D'Altri, Teresa and Romero, Anna Foix and Ciges, Jorge Izquierdo and Jene, Aina and Kerry, Giselle and Moldes, Mauricio and Ariosa, Roberto and Bahena, Silvia and Barrowdale, Daniel and Barbero, Marcos Casado and Fernandez-Orth, Dietmar and Garcia-Linares, Carles and Garcia-Rios, Emilio and Haziza, Frédéric and Juhasz, Bela and Llobet, Oscar Martinez and Milla, Gemma and Mohan, Anand and Rueda, Manuel and Sankar, Aravind and Shaju, Dona and Shimpi, Ashutosh and Singh, Babita and Thomas, Coline and de la Torre, Sabela and Uyan, Umuthan and Vasallo, Claudia and Flicek, Paul and Guigo, Roderic and Navarro, Arcadi and Parkinson, Helen and Keane, Thomas and Rambla, Jordi},
DOI={10.1093/nar/gkab1059},
Number={D1},
Volume={50},
Month={January},
Year={2022},
Journal={Nucleic acids research},
ISSN={0305-1048},
Pages={D980--D987},
Abstract={The European Genome-phenome Archive (EGA - https://ega-archive.org/) is a resource for long term secure archiving of all types of potentially identifiable genetic, phenotypic, and clinical data resulting from biomedical research projects. Its mission is to foster hosted data reuse, enable reproducibility, and accelerate biomedical and translational research in line with the FAIR principles. Launched in 2008, the EGA has grown quickly, currently archiving over 4,500 studies from nearly one thousand institutions. The EGA operates a distributed data access model in which requests are made to the data controller, not to the EGA, therefore, the submitter keeps control on who has access to the data and under which conditions. Given the size and value of data hosted, the EGA is constantly improving its value chain, that is, how the EGA can contribute to enhancing the value of human health data by facilitating its submission, discovery, access, and distribution, as well as leading the design and implementation of standards and methods necessary to deliver the value chain. The EGA has become a key GA4GH Driver Project, leading multiple development efforts and implementing new standards and tools, and has been appointed as an ELIXIR Core Data Resource.},
URL={https://europepmc.org/articles/PMC8728218},
}

@misc{GDI,
title={Genomic Data Infrastructure},
url={https://gdi.onemilliongenomes.eu/},
author={{GDI Consortium}},
publisher={GDI},
note={Accessed: 2023-11-02}
}

@article{crypt4gh,
author = {Senf, Alexander and Davies, Robert and Haziza, Frédéric and Marshall, John and Troncoso-Pastoriza, Juan and Hofmann, Oliver and Keane, Thomas M.},
title = "{Crypt4GH: a file format standard enabling native access to encrypted data}",
journal = {Bioinformatics},
volume = {37},
number = {17},
pages = {2753-2754},
year = {2021},
month = {02},
abstract = "{The majority of genome analysis tools and pipelines require data to be decrypted for access. This potentially leaves sensitive genetic data exposed, either because the unencrypted data is not removed after analysis, or because the data leaves traces on the permanent storage medium. We defined a file container specification enabling direct byte-level compatible random access to encrypted genetic data stored in community standards such as SAM/BAM/CRAM/VCF/BCF. By standardizing this format, we show how it can be added as a native file format to genomic libraries, enabling direct analysis of encrypted data without the need to create a decrypted copy. The Crypt4GH specification can be found at: http://samtools.github.io/hts-specs/crypt4gh.pdf. Supplementary data are available at Bioinformatics online.}",
issn = {1367-4803},
doi = {10.1093/bioinformatics/btab087},
url = {https://doi.org/10.1093/bioinformatics/btab087},
eprint = {https://academic.oup.com/bioinformatics/article-pdf/37/17/2753/50339113/btab087.pdf},
}

@article{htsget,
author = {Kelleher, Jerome and Lin, Mike and Albach, C H and Birney, Ewan and Davies, Robert and Gourtovaia, Marina and Glazer, David and Gonzalez, Cristina Y and Jackson, David K and Kemp, Aaron and Marshall, John and Nowak, Andrew and Senf, Alexander and Tovar-Corona, Jaime M and Vikhorev, Alexander and Keane, Thomas M and GA4GH Streaming Task Team },
title = "{htsget: a protocol for securely streaming genomic data}",
journal = {Bioinformatics},
volume = {35},
number = {1},
pages = {119-121},
year = {2018},
month = {06},
abstract = "{Standardized interfaces for efficiently accessing high-throughput sequencing data are a fundamental requirement for large-scale genomic data sharing. We have developed htsget, a protocol for secure, efficient and reliable access to sequencing read and variation data. We demonstrate four independent client and server implementations, and the results of a comprehensive interoperability demonstration. The htsget specification can be found at: http://samtools.github.io/hts-specs/htsget.html. Supplementary data are available at Bioinformatics online.}",
issn = {1367-4803},
doi = {10.1093/bioinformatics/bty492},
url = {https://doi.org/10.1093/bioinformatics/bty492},
eprint = {https://academic.oup.com/bioinformatics/article-pdf/35/1/119/48962810/bioinformatics\_35\_1\_119.pdf},
}


@misc{htsget-rs,
title={htsget rust implementation},
url={https://github.com/umccr/htsget-rs},
author={htsget-rs team},
publisher={htsget-rs team},
note={Accessed: 2023-11-02}
}
Loading