diff --git a/docs/source/conf.py b/docs/source/conf.py
index 21a08f0..374718a 100644
--- a/docs/source/conf.py
+++ b/docs/source/conf.py
@@ -91,6 +91,7 @@
 #
 html_theme = "sphinx_rtd_theme"
 html_theme_path = [sphinx_rtd_theme.get_html_theme_path()]
+html_favicon = 'favicon.ico'
 
 # Theme options are theme-specific and customize the look and feel of a theme
diff --git a/docs/source/contributing.rst b/docs/source/contributing.rst
index 75f26ce..b3d90f3 100644
--- a/docs/source/contributing.rst
+++ b/docs/source/contributing.rst
@@ -132,6 +132,42 @@ After the bindings are generated, copy them to their respective folders in the p
 are their own package within the overall pymapd package. Also, take note to remove unneeded
 imports as shown in this `commit`_, as the unneeded imports can be problematic, especially
 when calling pymapd from other languages (specifically, R).
+
+--------------------------
+Updating the Documentation
+--------------------------
+
+The documentation for pymapd is generated by ReadTheDocs on each commit. Some pages (such as this one) are manually created;
+others, such as the API Reference, are generated from the docstrings of each method.
+
+If you are planning on making non-trivial changes to the documentation and want to preview the result before making a commit,
+you need to install sphinx and sphinx-rtd-theme into your development environment:
+
+.. code-block:: shell
+
+   pip install sphinx sphinx-rtd-theme
+
+Once you have sphinx installed, switch to the ``pymapd/docs`` directory and run ``make html`` to build the documentation. This will update the documentation
+in the ``pymapd/docs/build/html`` directory. From that directory, running ``python -m http.server`` will allow you to preview the site on ``localhost:8000``
+in your browser. Rerun ``make html`` each time you save a file to see your changes reflected in the documentation.
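+
+Putting the steps above together, a typical preview loop looks like this (a
+sketch; paths assume you are starting from the repository root):
+
+.. code-block:: shell
+
+   cd pymapd/docs
+   make html
+   cd build/html
+   python -m http.server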
+
 --------------------------------
 Publishing a new package version
 --------------------------------
diff --git a/docs/source/faq.rst b/docs/source/faq.rst
new file mode 100644
index 0000000..885e143
--- /dev/null
+++ b/docs/source/faq.rst
@@ -0,0 +1,74 @@
+.. _faq:
+
+FAQ and Known Limitations
+=========================
+
+This page contains information that doesn't fit into other pages or is
+important enough to be called out separately. If you have a question or tidbit
+of information that you feel should be included here, please create an `issue`_
+and/or `pull request`_ to get it added to this page.
+
+.. note::
+   While we strive to keep this page updated, bugfixes and new features
+   are being added regularly. If information on this page conflicts with
+   your experience, please open an `issue`_ or drop by our `Community forum`_
+   to get clarification.
+
+
+FAQ
+***
+
+:Q: Why do ``select_ipc()`` and ``select_ipc_gpu()`` give me errors, but ``execute()``
+    works fine?
+
+:A: Both ``select_ipc()`` and ``select_ipc_gpu()`` require running the pymapd code
+    on the same machine where OmniSci is running. This also implies that these two
+    methods will not work on Windows machines, just Linux (CPU and GPU) and OSX (CPU-only).
+
+..
+
+:Q: Why does geospatial data get uploaded as ``TEXT ENCODED DICT(32)``?
+
+:A: When using ``load_table`` with ``create=True`` or ``create='infer'``, data
+    whose type cannot be easily inferred will default to ``TEXT ENCODED DICT(32)``.
+    To solve this issue, create the table definition before loading the data.
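+
+    A minimal sketch of this workaround, assuming an existing connection
+    ``con`` (the table ``my_geo_table`` and DataFrame ``df`` are hypothetical
+    names used for illustration):
+
+    .. code-block:: python
+
+       >>> # declare the geo column type up front, then load the DataFrame
+       >>> # without auto-creating the table
+       >>> con.execute("CREATE TABLE my_geo_table (name TEXT ENCODED DICT(32), location POINT)")
+       >>> con.load_table("my_geo_table", df, create=False)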
+
+
+Helpful Hints
+*************
+
+* Convert your timestamps to UTC
+  OmniSci stores timestamps as UTC. When loading data to OmniSci, plain Python
+  ``datetime`` objects are assumed to be UTC. If the ``datetime`` object is
+  timezone-aware, only ``datetime64[ns, UTC]`` is supported (see the sketch
+  after this list).
+
+* When loading data, hand-create the table schema if performance is critical
+  While ``load_table()`` does provide a ``create`` keyword argument to
+  auto-create the table before attempting to load to OmniSci, this functionality
+  is for *convenience purposes only*. You are in a much better position
+  to know the exact data types of your input data than the heuristics used by pymapd.
+
+  Additionally, pymapd does not attempt to use the smallest possible column
+  width to represent your data. For example, significant reductions in disk
+  storage and a larger amount of 'hot data' can be realized if your data fits
+  in a ``TINYINT`` column rather than an ``INTEGER``.
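+
+As a combined sketch of both hints, assuming an existing connection ``con``
+(the table and column names here are hypothetical):
+
+.. code-block:: python
+
+   >>> import pandas as pd
+   >>> df = pd.DataFrame({"flag": [0, 1, 2],
+   ...                    "ts": pd.to_datetime(["2019-01-01 12:00:00"] * 3)})
+   >>> df["ts"] = df["ts"].dt.tz_localize("UTC")  # store explicitly as UTC
+   >>> con.execute("CREATE TABLE hints_example (flag TINYINT, ts TIMESTAMP)")
+   >>> con.load_table("hints_example", df, create=False)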
+
+Known Limitations
+*****************
+
+* OmniSci ``BIGINT`` is 64-bit
+  Be careful using pymapd on 32-bit systems, as we do not check for integer
+  overflow when returning query results.
+
+* ``DECIMAL`` types returned as Python ``float``
+  OmniSci stores and performs ``DECIMAL`` calculations within the
+  database at the column-definition level of precision. However, the results
+  are currently returned to Python as ``float``. We are evaluating how to
+  change this behavior, so that the exact decimal representation is consistent
+  on the server and in Python.
+
+
+.. _issue: https://github.com/omnisci/pymapd/issues
+.. _pull request: https://github.com/omnisci/pymapd/pulls
+.. _Community forum: https://community.omnisci.com/forum
diff --git a/docs/source/favicon.ico b/docs/source/favicon.ico
new file mode 100644
index 0000000..84608a6
Binary files /dev/null and b/docs/source/favicon.ico differ
diff --git a/docs/source/index.rst b/docs/source/index.rst
index 6523894..e72cca3 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -34,6 +34,7 @@ the `Apache Arrow`_-based `cudf GPU DataFrame`_ format for efficient data interc
    api
    contributing
    releasenotes
+   faq
 
 .. _DB-API-2.0: https://www.python.org/dev/peps/pep-0249/
diff --git a/docs/source/usage.rst b/docs/source/usage.rst
index 537bbdc..8df90bc 100644
--- a/docs/source/usage.rst
+++ b/docs/source/usage.rst
@@ -11,8 +11,9 @@ clients will feel similar to pymapd.
 .. note::
 
    This tutorial assumes you have an OmniSci server running on ``localhost:6274`` with the
-   default logins and databases, and have loaded the example "flights_2008_10k"
-   dataset.
+   default logins and databases, and have loaded the example ``flights_2008_10k``
+   dataset. This dataset can be obtained from the ``insert_sample_data`` script included
+   in the OmniSci install directory.
 
 Installing pymapd
 -----------------
@@ -31,7 +32,7 @@ pymapd
    pip install pymapd
 
 If you have an NVIDIA GPU in the same machine where your pymapd code will be running, you'll want to `install
-cudf`_ as well if you want to return results sets into GPU memory as a cudf GPU DataFrame:
+cudf`_ as well to return result sets into GPU memory as a cudf GPU DataFrame:
 
 cudf via conda
 **************
@@ -59,7 +60,15 @@ cudf via PyPI/pip
 Connecting
 ----------
 
-Create a :class:`Connection` with
+Self-Hosted Install
+*******************
+
+For self-hosted OmniSci installs, use ``protocol='binary'`` (this is the default)
+to connect with OmniSci, as this will have better performance than using
+``protocol='http'`` or ``protocol='https'``.
+
+To create a :class:`Connection`, use the ``connect()`` method along with the ``user``,
+``password``, ``host`` and ``dbname`` parameters:
 
 .. code-block:: python
 
@@ -69,7 +78,8 @@ Create a :class:`Connection` with
 >>> con
 Connection(mapd://mapd:***@localhost:6274/mapd?protocol=binary)
 
-or by passing in a connection string
+Alternatively, you can pass in a `SQLAlchemy`_-compliant connection string to
+the ``connect()`` method:
 
 .. code-block:: python
 
@@ -77,14 +87,13 @@ or by passing in a connection string
 >>> con = connect(uri=uri)
 Connection(mapd://mapd:***@localhost:6274/mapd?protocol=binary)
 
-See the `SQLAlchemy`_ documentation on what makes up a connection string. The
-components are::
+OmniSci Cloud
+*************
 
-    dialect+driver://username:password@host:port/database
+When connecting to OmniSci Cloud, the two connection methods above work the same
+way; however, you can only use ``protocol='https'``. For a step-by-step
+walk-through with screenshots, please see this `blog post`_.
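+
+For example (a sketch; the credentials and hostname below are hypothetical
+placeholder values, not real OmniSci Cloud settings):
+
+.. code-block:: python
+
+    >>> con = connect(user="APIKEY", password="APISECRET",
+    ...               host="myinstance.omnisci.cloud", dbname="mapd",
+    ...               port=443, protocol="https")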
 
-For ``pymapd``, the ``dialect+driver`` will always be ``mapd``, and we look for
-a ``protocol`` argument in the optional query parameters (everything following
-the ``?`` after ``database``).
 
 Querying
 --------
@@ -109,11 +118,13 @@ that your OmniSci database is running on the same machine.
    and microseconds granularity. Support for nanoseconds, ``Timestamp(9)`` is
    in progress.
 
-GPU Select
-^^^^^^^^^^
+GPU Shared Memory
+*****************
 
 Use :meth:`Connection.select_ipc_gpu` to select data into a ``GpuDataFrame``,
-provided by `cudf`_
+provided by `cudf`_. To use this method, **the Python code must be running
+on the same machine as the OmniSci installation AND you must have an NVIDIA GPU
+installed.**
 
 .. code-block:: python
 
@@ -127,11 +138,13 @@ provided by `cudf`_
 
    3        4       -3
   4       12        7
 
-CPU Shared Memory Select
-^^^^^^^^^^^^^^^^^^^^^^^^
+CPU Shared Memory
+*****************
 
 Use :meth:`Connection.select_ipc` to select data into a pandas ``DataFrame``
-using CPU shared memory to avoid unnecessary intermediate copies.
+using CPU shared memory to avoid unnecessary intermediate copies. To use this
+method, **the Python code must be running on the same machine as the OmniSci
+installation.**
 
 .. code-block:: python
 
@@ -144,10 +157,28 @@ using CPU shared memory to avoid unnecessary intermediate copies.
 
    3        4       -3
   4       12        7
 
+pandas.read_sql()
+*****************
+
+With a :class:`Connection` defined, you can use ``pandas.read_sql()`` to
+read your data into a pandas ``DataFrame``. This will be slower than using
+:meth:`Connection.select_ipc`, but works regardless of where the Python code
+is running (whereas ``select_ipc()`` must run on the same machine as the OmniSci
+install, ``pandas.read_sql()`` works everywhere):
+
+.. code-block:: python
+
+    >>> from pymapd import connect
+    >>> import pandas as pd
+    >>> con = connect(user="mapd", password="HyperInteractive", host="localhost",
+    ...               dbname="mapd")
+    >>> df = pd.read_sql("SELECT depdelay, arrdelay FROM flights_2008_10k limit 100", con)
+
+
 Cursors
--------
+*******
 
-A cursor can be created with :meth:`Connection.cursor`
+After connecting to OmniSci, a cursor can be created with :meth:`Connection.cursor`:
 
 .. code-block:: python
 
@@ -225,13 +256,11 @@ If you aren't using arrow or pandas you can pass list of tuples to
 
 The high-level :meth:`Connection.load_table` method will choose the fastest
-method available based on the type of ``data`` and whether or not ``pyarrow`` is
-installed.
+method available based on the type of ``data`` (see the sketch after this list).
 
 * lists of tuples are always loaded with :meth:`Connection.load_table_rowwise`
-* If ``pyarrow`` is installed, a ``pandas.DataFrame`` or ``pyarrow.Table`` will
-  be loaded using :meth:`Connection.load_table_arrow`
-* If ``pyarrow`` is not installed, a ``pandas.DataFrame`` will be loaded using
+* A ``pandas.DataFrame`` or ``pyarrow.Table`` will be loaded using
   :meth:`Connection.load_table_arrow`
+* If the upload fails using the Arrow method, a ``pandas.DataFrame`` can be loaded using
   :meth:`Connection.load_table_columnar`
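+
+For example, you can bypass the inference and pick a loader explicitly with the
+``method`` keyword (a sketch; ``df`` is a pandas ``DataFrame`` defined elsewhere):
+
+.. code-block:: python
+
+    >>> # force the columnar loader instead of the Arrow path
+    >>> con.load_table("flights_2008_10k", df, method='columnar')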
""" if parameters is not None: @@ -364,7 +368,7 @@ def deallocate_ipc_gpu(self, df, device_id=0): Parameters ---------- - device_id : int + device_ids: int GPU which contains TDataFrame """ @@ -381,7 +385,7 @@ def deallocate_ipc(self, df, device_id=0): Parameters ---------- - device_id : int + device_id: int GPU which contains TDataFrame """ tdf = df.get_tdf() @@ -410,11 +414,11 @@ def get_table_details(self, table_name): Parameters ---------- - table_name : str + table_name: str Returns ------- - details : List[tuples] + details: List[tuples] Examples -------- @@ -434,9 +438,9 @@ def create_table(self, table_name, data, preserve_index=False): Parameters ---------- - table_name : str - data : DataFrame - preserve_index : bool, default False + table_name: str + data: DataFrame + preserve_index: bool, default False Whether to create a column in the table for the DataFrame index """ @@ -451,9 +455,9 @@ def load_table(self, table_name, data, method='infer', Parameters ---------- - table_name : str - data : pyarrow.Table, pandas.DataFrame, or iterable of tuples - method : {'infer', 'columnar', 'rows', 'arrow'} + table_name: str + data: pyarrow.Table, pandas.DataFrame, or iterable of tuples + method: {'infer', 'columnar', 'rows', 'arrow'} Method to use for loading the data. Three options are available 1. ``pyarrow`` and Apache Arrow loader @@ -467,16 +471,16 @@ def load_table(self, table_name, data, method='infer', columnar loader is used. Finally, ``data`` is an iterable of tuples the row-wise loader is used. - preserve_index : bool, default False + preserve_index: bool, default False Whether to keep the index when loading a pandas DataFrame - create : {"infer", True, False} + create: {"infer", True, False} Whether to issue a CREATE TABLE before inserting the data. - * infer : check to see if the table already exists, and create + * infer: check to see if the table already exists, and create a table if it does not - * True : attempt to create the table, without checking if it exists - * False : do not attempt to create the table + * True: attempt to create the table, without checking if it exists + * False: do not attempt to create the table See Also -------- @@ -528,8 +532,8 @@ def load_table_rowwise(self, table_name, data): Parameters ---------- - table_name : str - data : Iterable of tuples + table_name: str + data: Iterable of tuples Each element of `data` should be a row to be inserted See Also @@ -559,15 +563,15 @@ def load_table_columnar( Parameters ---------- - table_name : str - data : DataFrame - preserve_index : bool, default False + table_name: str + data: DataFrame + preserve_index: bool, default False Whether to include the index of a pandas DataFrame when writing. - chunk_size_bytes : integer, default 0 + chunk_size_bytes: integer, default 0 Chunk the loading of columns to prevent large Thrift requests. A value of 0 means do not chunk and send the dataframe as a single request - col_names_from_schema : bool, default False + col_names_from_schema: bool, default False Read the existing table schema to determine the column names. This will read the schema of an existing table in OmniSci and match those names to the column names of the dataframe. This is for @@ -585,8 +589,8 @@ def load_table_columnar( load_table_arrow load_table_rowwise - Note - ---- + Notes + ----- Use ``pymapd >= 0.11.0`` while running with ``omnisci >= 4.6.0`` in order to avoid loading inconsistent values into DATE column. 
""" @@ -627,9 +631,9 @@ def load_table_arrow(self, table_name, data, preserve_index=False): Parameters ---------- - table_name : str - data : pandas.DataFrame, pyarrow.RecordBatch, pyarrow.Table - preserve_index : bool, default False + table_name: str + data: pandas.DataFrame, pyarrow.RecordBatch, pyarrow.Table + preserve_index: bool, default False Whether to include the index of a pandas DataFrame when writing. Examples @@ -656,7 +660,7 @@ def render_vega(self, vega, compression_level=1): Parameters ---------- - vega : dict + vega: dict The vega specification to render. compression_level: int The level of compression for the rendered PNG. Ranges from @@ -675,7 +679,7 @@ def render_vega(self, vega, compression_level=1): def get_dashboards(self): """List all the dashboards in the database - Example + Examples -------- >>> con.get_dashboards() """ @@ -692,9 +696,9 @@ def duplicate_dashboard(self, dashboard_id, new_name=None, Parameters ---------- - dashboard_id : int + dashboard_id: int The id of the dashboard to duplicate - new_name : str + new_name: str The name for the new dashboard source_remap: dict EXPERIMENTAL @@ -703,15 +707,12 @@ def duplicate_dashboard(self, dashboard_id, new_name=None, dict with a 'name' key holding the new table value. This structure can be used later to support changing column names. - Example of source_remap format: - { - 'oldtablename1': { - 'name': 'newtablename1' - }, - 'oldtablename2': { - 'name': 'newtablename2' - } - } + + Examples + -------- + >>> source_remap = {'oldtablename1': {'name': 'newtablename1'}, \ +'oldtablename2': {'name': 'newtablename2'}} + >>> newdash = con.duplicate_dashboard(12345, "new dash", source_remap) """ source_remap = source_remap or {} d = self._client.get_dashboard( diff --git a/pymapd/cursor.py b/pymapd/cursor.py index f1dc030..3f5a715 100644 --- a/pymapd/cursor.py +++ b/pymapd/cursor.py @@ -79,9 +79,9 @@ def execute(self, operation, parameters=None): Parameters ---------- - operation : str + operation: str A SQL query - parameters : dict + parameters: dict Parameters to substitute into ``operation``. Returns @@ -127,12 +127,12 @@ def executemany(self, operation, parameters): Parameters ---------- - operation : str - parameters : list of dict + operation: str + parameters: list of dict Returns ------- - results : list of lists + results: list of lists """ results = [list(self.execute(operation, params)) for params in parameters] @@ -172,11 +172,11 @@ def make_row_results_set(data): Parameters ---------- - data : QueryResultSet + data: QueryResultSet Returns ------- - results : Iterator[tuple] + results: Iterator[tuple] """ if data.row_set.columns: diff --git a/tests/test_data_no_nulls_cpu.py b/tests/test_data_no_nulls_cpu.py index 67454c8..0884f53 100644 --- a/tests/test_data_no_nulls_cpu.py +++ b/tests/test_data_no_nulls_cpu.py @@ -95,10 +95,8 @@ def test_create_load_table_no_nulls_select_ipc(self, con, method): """ # need to manually specify columns since some don't currently work # need to drop unsupported columns from df_in - # (BOOL) https://github.com/omnisci/pymapd/issues/211 df_in = _tests_table_no_nulls(10000) - df_in.drop(columns=["bool_", - "point_", + df_in.drop(columns=["point_", "line_", "mpoly_", "poly_"], inplace=True) @@ -113,6 +111,7 @@ def test_create_load_table_no_nulls_select_ipc(self, con, method): bigint_, float_, double_, + bool_, date_, datetime_, time_,