diff --git a/apps/iatlas/data/requirements.txt b/apps/iatlas/data/requirements.txt index 5cb771ff2a..819e157284 100644 --- a/apps/iatlas/data/requirements.txt +++ b/apps/iatlas/data/requirements.txt @@ -1,24 +1,20 @@ -attrs==23.1.0 ; python_version >= "3.9" and python_version < "4.0" +attrs==23.2.0 ; python_version >= "3.9" and python_version < "4.0" backoff==2.2.1 ; python_version >= "3.9" and python_version < "4.0" -certifi==2023.11.17 ; python_version >= "3.9" and python_version < "4.0" -cffi==1.16.0 ; python_version >= "3.9" and python_version < "4.0" and sys_platform == "linux" +certifi==2024.2.2 ; python_version >= "3.9" and python_version < "4.0" charset-normalizer==3.3.2 ; python_version >= "3.9" and python_version < "4.0" click==8.1.7 ; python_version >= "3.9" and python_version < "4.0" colorama==0.4.6 ; python_version >= "3.9" and python_version < "4.0" -cryptography==41.0.7 ; python_version >= "3.9" and python_version < "4.0" and sys_platform == "linux" decorator==5.1.1 ; python_version >= "3.9" and python_version < "4.0" deprecated==1.2.14 ; python_version >= "3.9" and python_version < "4.0" deprecation==2.1.0 ; python_version >= "3.9" and python_version < "4.0" -googleapis-common-protos==1.61.0 ; python_version >= "3.9" and python_version < "4.0" -greenlet==3.0.1 ; python_version >= "3.9" and python_version < "4.0" and platform_machine == "aarch64" or python_version >= "3.9" and python_version < "4.0" and platform_machine == "ppc64le" or python_version >= "3.9" and python_version < "4.0" and platform_machine == "x86_64" or python_version >= "3.9" and python_version < "4.0" and platform_machine == "amd64" or python_version >= "3.9" and python_version < "4.0" and platform_machine == "AMD64" or python_version >= "3.9" and python_version < "4.0" and platform_machine == "win32" or python_version >= "3.9" and python_version < "4.0" and platform_machine == "WIN32" +googleapis-common-protos==1.62.0 ; python_version >= "3.9" and python_version < "4.0" +greenlet==3.0.3 ; python_version >= "3.9" and python_version < "4.0" and (platform_machine == "aarch64" or platform_machine == "ppc64le" or platform_machine == "x86_64" or platform_machine == "amd64" or platform_machine == "AMD64" or platform_machine == "win32" or platform_machine == "WIN32") idna==3.6 ; python_version >= "3.9" and python_version < "4.0" -importlib-metadata==6.8.0 ; python_version >= "3.9" and python_version < "4.0" +importlib-metadata==6.11.0 ; python_version >= "3.9" and python_version < "4.0" interrogate==1.5.0 ; python_version >= "3.9" and python_version < "4.0" -jeepney==0.8.0 ; python_version >= "3.9" and python_version < "4.0" and sys_platform == "linux" -keyring==23.4.1 ; python_version >= "3.9" and python_version < "4.0" -keyrings-alt==3.1 ; python_version >= "3.9" and python_version < "4.0" and sys_platform == "linux" +nest-asyncio==1.6.0 ; python_version >= "3.9" and python_version < "4.0" networkx==2.8.8 ; python_version >= "3.9" and python_version < "4.0" -numpy==1.26.2 ; python_version >= "3.9" and python_version <= "3.11" or python_version >= "3.12" and python_version < "4.0" +numpy==1.26.4 ; python_version >= "3.9" and python_version <= "3.11" or python_version >= "3.12" and python_version < "4.0" opentelemetry-api==1.21.0 ; python_version >= "3.9" and python_version < "4.0" opentelemetry-exporter-otlp-proto-common==1.21.0 ; python_version >= "3.9" and python_version < "4.0" opentelemetry-exporter-otlp-proto-http==1.21.0 ; python_version >= "3.9" and python_version < "4.0" @@ -26,28 +22,25 @@ opentelemetry-proto==1.21.0 ; python_version >= "3.9" and python_version < "4.0" opentelemetry-sdk==1.21.0 ; python_version >= "3.9" and python_version < "4.0" opentelemetry-semantic-conventions==0.42b0 ; python_version >= "3.9" and python_version < "4.0" packaging==23.2 ; python_version >= "3.9" and python_version < "4.0" -pandas==2.1.3 ; python_version >= "3.9" and python_version < "4.0" -protobuf==4.25.1 ; python_version >= "3.9" and python_version < "4.0" +pandas==2.2.1 ; python_version >= "3.9" and python_version < "4.0" +protobuf==4.25.3 ; python_version >= "3.9" and python_version < "4.0" psycopg2-binary==2.9.9 ; python_version >= "3.9" and python_version < "4.0" py==1.11.0 ; python_version >= "3.9" and python_version < "4.0" -pycparser==2.21 ; python_version >= "3.9" and python_version < "4.0" and sys_platform == "linux" -pydantic==1.10.13 ; python_version >= "3.9" and python_version < "4.0" +pydantic==1.10.14 ; python_version >= "3.9" and python_version < "4.0" python-dateutil==2.8.2 ; python_version >= "3.9" and python_version < "4.0" -pytz==2023.3.post1 ; python_version >= "3.9" and python_version < "4.0" -pywin32-ctypes==0.2.2 ; python_version >= "3.9" and python_version < "4.0" and sys_platform == "win32" +pytz==2024.1 ; python_version >= "3.9" and python_version < "4.0" pyyaml==6.0.1 ; python_version >= "3.9" and python_version < "4.0" requests==2.31.0 ; python_version >= "3.9" and python_version < "4.0" -schematic-db[postgres,synapse]==0.0.36 ; python_version >= "3.9" and python_version < "4.0" -secretstorage==3.3.3 ; python_version >= "3.9" and python_version < "4.0" and sys_platform == "linux" +schematic-db[postgres,synapse]==0.0.41 ; python_version >= "3.9" and python_version < "4.0" six==1.16.0 ; python_version >= "3.9" and python_version < "4.0" sqlalchemy-utils==0.41.1 ; python_version >= "3.9" and python_version < "4.0" -sqlalchemy==2.0.23 ; python_version >= "3.9" and python_version < "4.0" -synapseclient==3.2.0 ; python_version >= "3.9" and python_version < "4.0" +sqlalchemy==2.0.27 ; python_version >= "3.9" and python_version < "4.0" +synapseclient==4.1.1 ; python_version >= "3.9" and python_version < "4.0" tabulate==0.9.0 ; python_version >= "3.9" and python_version < "4.0" tenacity==8.2.3 ; python_version >= "3.9" and python_version < "4.0" toml==0.10.2 ; python_version >= "3.9" and python_version < "4.0" -typing-extensions==4.8.0 ; python_version >= "3.9" and python_version < "4.0" -tzdata==2023.3 ; python_version >= "3.9" and python_version < "4.0" +typing-extensions==4.10.0 ; python_version >= "3.9" and python_version < "4.0" +tzdata==2024.1 ; python_version >= "3.9" and python_version < "4.0" urllib3==1.26.18 ; python_version >= "3.9" and python_version < "4.0" validators==0.20.0 ; python_version >= "3.9" and python_version < "4.0" wrapt==1.16.0 ; python_version >= "3.9" and python_version < "4.0" diff --git a/apps/iatlas/data/src/build_database.py b/apps/iatlas/data/src/build_database.py index db4072e249..6cb7a7bfca 100755 --- a/apps/iatlas/data/src/build_database.py +++ b/apps/iatlas/data/src/build_database.py @@ -36,6 +36,185 @@ raise ValueError(f"{var} is None") iatlas_config = [ + { + "name": "cells", + "primary_key": "id", + "columns": [ + {"column_name": "id", "datatype": "str", "required": True, "index": True}, + { + "column_name": "name", + "datatype": "str", + "required": True, + "index": False, + }, + { + "column_name": "cell_type", + "datatype": "str", + "required": True, + "index": False, + }, + ], + }, + { + "name": "cells_to_samples", + "primary_key": "id", + "foreign_keys": [ + { + "column_name": "cell_id", + "foreign_table_name": "cells", + "foreign_column_name": "id", + }, + { + "column_name": "sample_id", + "foreign_table_name": "samples", + "foreign_column_name": "id", + }, + ], + "columns": [ + {"column_name": "id", "datatype": "str", "required": True, "index": True}, + { + "column_name": "cell_id", + "datatype": "str", + "required": True, + "index": True, + }, + { + "column_name": "sample_id", + "datatype": "str", + "required": False, + "index": True, + }, + ], + }, + { + "name": "cells_to_genes", + "primary_key": "id", + "foreign_keys": [ + { + "column_name": "cell_id", + "foreign_table_name": "cells", + "foreign_column_name": "id", + }, + { + "column_name": "gene_id", + "foreign_table_name": "genes", + "foreign_column_name": "id", + }, + ], + "columns": [ + {"column_name": "id", "datatype": "str", "required": True, "index": True}, + { + "column_name": "cell_id", + "datatype": "str", + "required": True, + "index": True, + }, + { + "column_name": "gene_id", + "datatype": "str", + "required": True, + "index": True, + }, + { + "column_name": "single_cell_seq", + "datatype": "float", + "required": True, + "index": False, + }, + ], + }, + { + "name": "cells_to_features", + "primary_key": "id", + "foreign_keys": [ + { + "column_name": "cell_id", + "foreign_table_name": "cells", + "foreign_column_name": "id", + }, + { + "column_name": "feature_id", + "foreign_table_name": "features", + "foreign_column_name": "id", + }, + ], + "columns": [ + {"column_name": "id", "datatype": "str", "required": True, "index": True}, + { + "column_name": "cell_id", + "datatype": "str", + "required": True, + "index": True, + }, + { + "column_name": "feature_id", + "datatype": "str", + "required": True, + "index": True, + }, + { + "column_name": "feature_value", + "datatype": "float", + "required": True, + "index": False, + }, + ], + }, + { + "name": "cell_stats", + "primary_key": "id", + "foreign_keys": [ + { + "column_name": "dataset_id", + "foreign_table_name": "datasets", + "foreign_column_name": "id", + }, + { + "column_name": "gene_id", + "foreign_table_name": "genes", + "foreign_column_name": "id", + }, + ], + "columns": [ + {"column_name": "id", "datatype": "str", "required": True, "index": True}, + { + "column_name": "dataset_id", + "datatype": "str", + "required": True, + "index": True, + }, + { + "column_name": "gene_id", + "datatype": "str", + "required": True, + "index": True, + }, + { + "column_name": "cell_type", + "datatype": "str", + "required": True, + "index": True, + }, + { + "column_name": "cell_count", + "datatype": "int", + "required": False, + "index": False, + }, + { + "column_name": "avg_expr", + "datatype": "float", + "required": False, + "index": False, + }, + { + "column_name": "perc_expr", + "datatype": "float", + "required": False, + "index": False, + }, + ], + }, { "name": "cohorts", "primary_key": "id", @@ -1563,6 +1742,92 @@ }, ], }, + { + "name": "single_cell_pseudobulk", + "primary_key": "id", + "foreign_keys": [ + { + "column_name": "gene_id", + "foreign_table_name": "genes", + "foreign_column_name": "id", + }, + { + "column_name": "sample_id", + "foreign_table_name": "samples", + "foreign_column_name": "id", + }, + ], + "columns": [ + {"column_name": "id", "datatype": "str", "required": True, "index": True}, + { + "column_name": "sample_id", + "datatype": "str", + "required": True, + "index": True, + }, + { + "column_name": "cell_type", + "datatype": "str", + "required": True, + "index": False, + }, + { + "column_name": "gene_id", + "datatype": "str", + "required": True, + "index": True, + }, + { + "column_name": "single_cell_seq_sum", + "datatype": "float", + "required": True, + "index": False, + }, + ], + }, + { + "name": "single_cell_pseudobulk_features", + "primary_key": "id", + "foreign_keys": [ + { + "column_name": "sample_id", + "foreign_table_name": "samples", + "foreign_column_name": "id", + }, + { + "column_name": "feature_id", + "foreign_table_name": "features", + "foreign_column_name": "id", + }, + ], + "columns": [ + {"column_name": "id", "datatype": "str", "required": True, "index": True}, + { + "column_name": "sample_id", + "datatype": "str", + "required": True, + "index": True, + }, + { + "column_name": "cell_type", + "datatype": "str", + "required": True, + "index": False, + }, + { + "column_name": "feature_id", + "datatype": "str", + "required": True, + "index": True, + }, + { + "column_name": "value", + "datatype": "float", + "required": True, + "index": False, + }, + ], + }, { "name": "snps", "primary_key": "id", @@ -1596,7 +1861,9 @@ ) schema = Schema( - SchemaConfig(schema_url=getenv("SCHEMA_URL")), DatabaseConfig(iatlas_config) + SchemaConfig(schema_url=getenv("SCHEMA_URL")), + DatabaseConfig(iatlas_config), + display_label_type="display_label", ) @@ -1609,109 +1876,110 @@ synapse_project_id=getenv("SCHEMA_PROJECT_ID"), synapse_asset_view_id=getenv("SCHEMA_ASSET_VIEW_ID"), synapse_auth_token=getenv("SCHEMA_AUTH_TOKEN"), - ) + ), + display_label_type="display_label", ) -updater = RDBUpdater(db, ms) -updater.update_database(method="insert") +# updater = RDBUpdater(db, ms) +# updater.update_database(method="insert") -def insert_as_chunk(df, db, table_name, chunk_size=20000): - num_chunks = ((len(df) - 1) // chunk_size) + 1 +# def insert_as_chunk(df, db, table_name, chunk_size=20000): +# num_chunks = ((len(df) - 1) // chunk_size) + 1 - for i in range(num_chunks): - # Getting chunk boundaries - start_index = i * chunk_size - end_index = (i + 1) * chunk_size +# for i in range(num_chunks): +# # Getting chunk boundaries +# start_index = i * chunk_size +# end_index = (i + 1) * chunk_size - # Selecting rows for the current chunk - chunk = df[start_index:end_index] +# # Selecting rows for the current chunk +# chunk = df[start_index:end_index] - # Inserting rows into the table - db.insert_table_rows(table_name, chunk) +# # Inserting rows into the table +# db.insert_table_rows(table_name, chunk) -cohorts_to_samples1 = db.execute_sql_query( - ( - "SELECT DISTINCT c.id AS cohort_id, t.id AS cohorts_to_samples_tag_id, stt.sample_id " - "FROM samples s " - "INNER JOIN samples_to_tags stt ON s.id = stt.sample_id " - "INNER JOIN tags t ON stt.tag_id = t.id " - "INNER JOIN tags_to_tags ttt ON t.id = ttt.tag_id " - "INNER JOIN tags t2 ON ttt.related_tag_id = t2.id " - "INNER JOIN datasets_to_samples dts on s.id = dts.sample_id " - "INNER JOIN cohorts c on t2.id = c.cohort_tag_id AND dts.dataset_id = c.dataset_id " - "WHERE t.tag_type = 'group'" - ) -) -cohorts_to_samples1["id"] = [ - uuid.uuid1() for _ in range(len(cohorts_to_samples1.index)) -] -insert_as_chunk(cohorts_to_samples1, db, "cohorts_to_samples") -# db.insert_table_rows("cohorts_to_samples", cohorts_to_samples1) +# cohorts_to_samples1 = db.execute_sql_query( +# ( +# "SELECT DISTINCT c.id AS cohort_id, t.id AS cohorts_to_samples_tag_id, stt.sample_id " +# "FROM samples s " +# "INNER JOIN samples_to_tags stt ON s.id = stt.sample_id " +# "INNER JOIN tags t ON stt.tag_id = t.id " +# "INNER JOIN tags_to_tags ttt ON t.id = ttt.tag_id " +# "INNER JOIN tags t2 ON ttt.related_tag_id = t2.id " +# "INNER JOIN datasets_to_samples dts on s.id = dts.sample_id " +# "INNER JOIN cohorts c on t2.id = c.cohort_tag_id AND dts.dataset_id = c.dataset_id " +# "WHERE t.tag_type = 'group'" +# ) +# ) +# cohorts_to_samples1["id"] = [ +# uuid.uuid1() for _ in range(len(cohorts_to_samples1.index)) +# ] +# insert_as_chunk(cohorts_to_samples1, db, "cohorts_to_samples") +# # db.insert_table_rows("cohorts_to_samples", cohorts_to_samples1) -cohorts_to_samples2 = db.execute_sql_query( - ( - "SELECT DISTINCT c.id AS cohort_id, dts.sample_id " - "FROM cohorts c " - "INNER JOIN datasets_to_samples dts ON c.dataset_id = dts.dataset_id " - "WHERE c.cohort_tag_id is NULL" - ) -) -cohorts_to_samples2["id"] = [ - uuid.uuid1() for _ in range(len(cohorts_to_samples2.index)) -] -insert_as_chunk(cohorts_to_samples2, db, "cohorts_to_samples") -# db.insert_table_rows("cohorts_to_samples", cohorts_to_samples2) +# cohorts_to_samples2 = db.execute_sql_query( +# ( +# "SELECT DISTINCT c.id AS cohort_id, dts.sample_id " +# "FROM cohorts c " +# "INNER JOIN datasets_to_samples dts ON c.dataset_id = dts.dataset_id " +# "WHERE c.cohort_tag_id is NULL" +# ) +# ) +# cohorts_to_samples2["id"] = [ +# uuid.uuid1() for _ in range(len(cohorts_to_samples2.index)) +# ] +# insert_as_chunk(cohorts_to_samples2, db, "cohorts_to_samples") +# # db.insert_table_rows("cohorts_to_samples", cohorts_to_samples2) -cohorts_to_features = db.execute_sql_query( - ( - "SELECT DISTINCT cohort_id, feature_id " - "FROM cohorts_to_samples cts " - "INNER JOIN features_to_samples fts USING(sample_id)" - ) -) -cohorts_to_features["id"] = [ - uuid.uuid1() for _ in range(len(cohorts_to_features.index)) -] -insert_as_chunk(cohorts_to_features, db, "cohorts_to_features") -# db.insert_table_rows("cohorts_to_features", cohorts_to_features) +# cohorts_to_features = db.execute_sql_query( +# ( +# "SELECT DISTINCT cohort_id, feature_id " +# "FROM cohorts_to_samples cts " +# "INNER JOIN features_to_samples fts USING(sample_id)" +# ) +# ) +# cohorts_to_features["id"] = [ +# uuid.uuid1() for _ in range(len(cohorts_to_features.index)) +# ] +# insert_as_chunk(cohorts_to_features, db, "cohorts_to_features") +# # db.insert_table_rows("cohorts_to_features", cohorts_to_features) -cohorts_to_genes = db.execute_sql_query( - ( - "SELECT DISTINCT cohort_id, gene_id " - "FROM cohorts_to_samples cts " - "INNER JOIN genes_to_samples gts USING(sample_id)" - ) -) -cohorts_to_genes["id"] = [uuid.uuid1() for _ in range(len(cohorts_to_genes.index))] -insert_as_chunk(cohorts_to_genes, db, "cohorts_to_genes") -# db.insert_table_rows("cohorts_to_genes", cohorts_to_genes) +# cohorts_to_genes = db.execute_sql_query( +# ( +# "SELECT DISTINCT cohort_id, gene_id " +# "FROM cohorts_to_samples cts " +# "INNER JOIN genes_to_samples gts USING(sample_id)" +# ) +# ) +# cohorts_to_genes["id"] = [uuid.uuid1() for _ in range(len(cohorts_to_genes.index))] +# insert_as_chunk(cohorts_to_genes, db, "cohorts_to_genes") +# # db.insert_table_rows("cohorts_to_genes", cohorts_to_genes) -cohorts_to_mutations = db.execute_sql_query( - ( - "SELECT DISTINCT cohort_id, mutation_id " - "FROM cohorts_to_samples cts " - "INNER JOIN samples_to_mutations stm USING(sample_id)" - ) -) -cohorts_to_mutations["id"] = [ - uuid.uuid1() for _ in range(len(cohorts_to_mutations.index)) -] -insert_as_chunk(cohorts_to_mutations, db, "cohorts_to_mutations") -# db.insert_table_rows("cohorts_to_mutations", cohorts_to_mutations) +# cohorts_to_mutations = db.execute_sql_query( +# ( +# "SELECT DISTINCT cohort_id, mutation_id " +# "FROM cohorts_to_samples cts " +# "INNER JOIN samples_to_mutations stm USING(sample_id)" +# ) +# ) +# cohorts_to_mutations["id"] = [ +# uuid.uuid1() for _ in range(len(cohorts_to_mutations.index)) +# ] +# insert_as_chunk(cohorts_to_mutations, db, "cohorts_to_mutations") +# # db.insert_table_rows("cohorts_to_mutations", cohorts_to_mutations) -cohorts_to_tags = db.execute_sql_query( - ( - "SELECT DISTINCT cohort_id, tag_id " - "FROM cohorts_to_samples cts " - "INNER JOIN samples_to_tags sts USING(sample_id)" - ) -) -cohorts_to_tags["id"] = [uuid.uuid1() for _ in range(len(cohorts_to_tags.index))] -insert_as_chunk(cohorts_to_tags, db, "cohorts_to_tags") -# db.insert_table_rows("cohorts_to_tags", cohorts_to_tags) +# cohorts_to_tags = db.execute_sql_query( +# ( +# "SELECT DISTINCT cohort_id, tag_id " +# "FROM cohorts_to_samples cts " +# "INNER JOIN samples_to_tags sts USING(sample_id)" +# ) +# ) +# cohorts_to_tags["id"] = [uuid.uuid1() for _ in range(len(cohorts_to_tags.index))] +# insert_as_chunk(cohorts_to_tags, db, "cohorts_to_tags") +# # db.insert_table_rows("cohorts_to_tags", cohorts_to_tags)