Skip to content

Commit

Permalink
Feature typehints (#10)
Browse files Browse the repository at this point in the history
* Allows X to be empty. Fixes #7

* Add typehints and documentation

* Fix uniontype

* nan is a float
  • Loading branch information
LouiseDck authored Nov 13, 2024
1 parent 2cf0931 commit 6187fcc
Show file tree
Hide file tree
Showing 5 changed files with 76 additions and 67 deletions.
9 changes: 6 additions & 3 deletions src/dummy_anndata/generate_dataframe.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,21 @@
import pandas as pd

from .generate_vector import vector_generators


def generate_dataframe(n_rows, types=None):
def generate_dataframe(n_rows: int, types: list[str] | None = None) -> pd.DataFrame:
"""
Generate a pandas DataFrame with specified number of rows and column types.
Parameters:
Parameters
----------
n_rows (int): The number of rows in the DataFrame.
types (list, optional): A list of column types to include in the DataFrame.
Choose from the list of vector_generators keys.
If not provided, all available column types will be included.
Returns:
Returns
-------
pandas.DataFrame: The generated DataFrame.
"""
Expand Down
31 changes: 16 additions & 15 deletions src/dummy_anndata/generate_dataset.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,19 +9,19 @@


def generate_dataset(
n_obs=10,
n_vars=20,
x_type=None,
layer_types=None,
obs_types=None,
var_types=None,
obsm_types=None,
varm_types=None,
obsp_types=None,
varp_types=None,
uns_types=None,
nested_uns_types=None,
):
n_obs: int = 10,
n_vars: int = 20,
x_type: str | None = None,
layer_types: list[str] | None = None,
obs_types: list[str] | None = None,
var_types: list[str] | None = None,
obsm_types: list[str] | None = None,
varm_types: list[str] | None = None,
obsp_types: list[str] | None = None,
varp_types: list[str] | None = None,
uns_types: list[str] | None = None,
nested_uns_types: list[str] | None = None,
) -> ad.AnnData:
"""
Generate a synthetic AnnData dataset with specified dimensions and data types.
Expand Down Expand Up @@ -105,7 +105,7 @@ def generate_dataset(
"nullable_boolean_array",
]
)
obsm_types = set(matrix_generators.keys()) - vector_not_allowed
obsm_types = list(set(matrix_generators.keys()) - vector_not_allowed)
if varm_types is None: # varm_types are all matrices or vectors, except for categoricals and nullables
vector_not_allowed = set(
[
Expand All @@ -117,7 +117,8 @@ def generate_dataset(
"nullable_boolean_array",
]
)
varm_types = set(matrix_generators.keys()) - vector_not_allowed
varm_types = list(set(matrix_generators.keys()) - vector_not_allowed)

if obsp_types is None: # obsp_types are all matrices
obsp_types = list(matrix_generators.keys())
if varp_types is None: # varp_types are all matrices
Expand Down
27 changes: 23 additions & 4 deletions src/dummy_anndata/generate_dict.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import numpy as np

from .generate_matrix import matrix_generators
from .generate_vector import vector_generators
from typing import Union

from .generate_matrix import matrix_generators, generated_matrix_types
from .generate_vector import vector_generators, generated_vector_types

scalar_generators = {
"string": "version",
Expand All @@ -14,6 +16,7 @@
"nan": np.nan,
}

generated_scalar_types = Union[str, int, float, bool, None, np.float64]

def generate_scalar(scalar_type):
if scalar_type[:7] == "scalar_":
Expand All @@ -30,8 +33,24 @@ def generate_type(type, n_rows, n_cols):
return matrix_generators[type](n_rows, n_cols)
return None

all_types = generated_scalar_types | generated_vector_types | generated_matrix_types
generated_dict_types = dict[str, all_types | dict[str, all_types]]

def generate_dict(
n_rows: int, n_cols: int, types: list[str] | None = None, nested_uns_types: list[str] | None = None
) -> generated_dict_types:
"""
Generates a dictionary with specified types of data.
Parameters:
n_rows (int): Number of rows for the generated data.
n_cols (int): Number of columns for the generated data.
types (list[str] | None): List of types to generate. If None, defaults to all available types.
nested_uns_types (list[str] | None): List of types for nested 'uns' data. If None, defaults to all available types.
def generate_dict(n_rows, n_cols, types=None, nested_uns_types=None):
Returns:
A dictionary containing the generated data.
"""
if types is None: # types are all scalars, all vectors and all matrices
types = (
list(scalar_generators.keys())
Expand All @@ -52,6 +71,6 @@ def generate_dict(n_rows, n_cols, types=None, nested_uns_types=None):
if types: # types is not empty
data = {t: generate_type(t, n_rows, n_cols) for t in types}
if nested_uns_types:
data["nested"] = generate_dict(n_rows, n_cols, types = nested_uns_types, nested_uns_types=[])
data["nested"] = generate_dict(n_rows, n_cols, types=nested_uns_types, nested_uns_types=[])

return data
43 changes: 17 additions & 26 deletions src/dummy_anndata/generate_matrix.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,44 +19,35 @@ def int_mtx(n_obs, n_vars):
# integer matrices do not support NAs in Python
matrix_generators = {
"float_matrix": lambda n_obs, n_vars: float_mtx(n_obs, n_vars),
"float_matrix_nas": lambda n_obs, n_vars: float_mtx(
n_obs, n_vars, NAs=True
),
"float_csparse": lambda n_obs, n_vars: sp.sparse.csc_matrix(
float_mtx(n_obs, n_vars)
),
"float_csparse_nas": lambda n_obs, n_vars: sp.sparse.csc_matrix(
float_mtx(n_obs, n_vars, NAs=True)
),
"float_rsparse": lambda n_obs, n_vars: sp.sparse.csr_matrix(
float_mtx(n_obs, n_vars)
),
"float_rsparse_nas": lambda n_obs, n_vars: sp.sparse.csr_matrix(
float_mtx(n_obs, n_vars, NAs=True)
),
"float_matrix_nas": lambda n_obs, n_vars: float_mtx(n_obs, n_vars, NAs=True),
"float_csparse": lambda n_obs, n_vars: sp.sparse.csc_matrix(float_mtx(n_obs, n_vars)),
"float_csparse_nas": lambda n_obs, n_vars: sp.sparse.csc_matrix(float_mtx(n_obs, n_vars, NAs=True)),
"float_rsparse": lambda n_obs, n_vars: sp.sparse.csr_matrix(float_mtx(n_obs, n_vars)),
"float_rsparse_nas": lambda n_obs, n_vars: sp.sparse.csr_matrix(float_mtx(n_obs, n_vars, NAs=True)),
"integer_matrix": lambda n_obs, n_vars: int_mtx(n_obs, n_vars),
"integer_csparse": lambda n_obs, n_vars: sp.sparse.csc_matrix(
int_mtx(n_obs, n_vars)
),
"integer_rsparse": lambda n_obs, n_vars: sp.sparse.csr_matrix(
int_mtx(n_obs, n_vars)
),
"integer_csparse": lambda n_obs, n_vars: sp.sparse.csc_matrix(int_mtx(n_obs, n_vars)),
"integer_rsparse": lambda n_obs, n_vars: sp.sparse.csr_matrix(int_mtx(n_obs, n_vars)),
}

generated_matrix_types = np.ndarray | sp.sparse.csc_matrix | sp.sparse.csr_matrix

def generate_matrix(n_obs, n_vars, matrix_type):
def generate_matrix(n_obs: int, n_vars: int, matrix_type: str) -> generated_matrix_types:
"""
Generate a matrix of given dimensions and type.
Parameters:
Parameters
----------
n_obs (int): The number of observations (rows) in the matrix.
n_vars (int): The number of variables (columns) in the matrix.
matrix_type (str): The type of matrix to generate.
Returns:
The generated matrix, either numpy.ndarray or scipy.sparse.csc_matrix or scipy.sparse.csr_matrix.
Returns
-------
np.ndarray | sp.sparse.csc_matrix | sp.sparse.csr_matrix:
The generated matrix.
Raises:
Raises
------
AssertionError: If the matrix_type is unknown.
"""
Expand Down
33 changes: 14 additions & 19 deletions src/dummy_anndata/generate_vector.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import pandas as pd
import numpy as np
import pandas as pd


def nullable_integer_array(n):
Expand Down Expand Up @@ -31,18 +31,10 @@ def missing_values_categorical(n, ordered=True):


vector_generators = {
"categorical": lambda n: pd.Categorical(
[["Value1", "Value2"][i % 2] for i in range(n)]
),
"categorical_ordered": lambda n: pd.Categorical(
[["Value1", "Value2"][i % 2] for i in range(n)], ordered=True
),
"categorical_missing_values": lambda n: missing_values_categorical(
n, ordered=False
),
"categorical_ordered_missing_values": lambda n: missing_values_categorical(
n, ordered=True
),
"categorical": lambda n: pd.Categorical([["Value1", "Value2"][i % 2] for i in range(n)]),
"categorical_ordered": lambda n: pd.Categorical([["Value1", "Value2"][i % 2] for i in range(n)], ordered=True),
"categorical_missing_values": lambda n: missing_values_categorical(n, ordered=False),
"categorical_ordered_missing_values": lambda n: missing_values_categorical(n, ordered=True),
"string_array": lambda n: np.array([f"value_{i}" for i in range(n)]),
# Should a 1D sparse array also be generated here? Probably better left to the matrix generators.
"dense_array": lambda n: np.arange(n, dtype=float) + 0.5,
Expand All @@ -52,21 +44,24 @@ def missing_values_categorical(n, ordered=True):
"nullable_boolean_array": nullable_boolean_array,
}

generated_vector_types = np.ndarray | pd.Categorical | pd.arrays.IntegerArray | pd.arrays.BooleanArray

def generate_vector(n, vector_type):
def generate_vector(n: int, vector_type: str) -> generated_vector_types:
"""
Generate a vector of a specified type.
Generate a vector of a specified type and length.
Parameters:
vector_type (str): The type of vector to generate.
n (int): The length of the vector.
n (int): The length of the vector to generate.
vector_type (str): The type of vector to generate. Must be one of the keys in the `vector_generators` dictionary.
Returns:
list: The generated vector.
np.ndarray | pd.Categorical | pd.arrays.IntegerArray | pd.arrays.BooleanArray:
A vector of the specified type and length.
Raises:
AssertionError: If the vector_type is unknown.
AssertionError: If `vector_type` is not a valid key in `vector_generators`.
"""

# check if vector_type is valid
assert vector_type in vector_generators, f"Unknown vector type: {vector_type}"

Expand Down

0 comments on commit 6187fcc

Please sign in to comment.