Merge pull request #9 from legouee/issue1

Support creating lazy arrays from SciPy sparse matrices (see #1)
NeuralEnsemble · Sep 12, 2017 · e487513 · e487513
2 parents dff61ad + b8ee999
commit e487513
Show file tree

Hide file tree

Showing 3 changed files with 281 additions and 3 deletions.
diff --git a/doc/tutorial.txt b/doc/tutorial.txt
@@ -240,3 +240,176 @@ The same is true for accessing individual rows or columns:
            [ 2, 14],
            [ 4, 16],
            [ 6, 18]])
+
+
+Creating lazy arrays from SciPy sparse matrices
+===============================================
+
+Lazy arrays may also be created from SciPy sparse matrices. There are 7 different sparse matrices.
+
+- csc_matrix(arg1[, shape, dtype, copy])	            Compressed Sparse Column matrix
+- csr_matrix(arg1[, shape, dtype, copy])	            Compressed Sparse Row matrix
+- bsr_matrix(arg1[, shape, dtype, copy, blocksize])	    Block Sparse Row matrix
+- lil_matrix(arg1[, shape, dtype, copy])	            Row-based linked list sparse matrix
+- dok_matrix(arg1[, shape, dtype, copy])	            Dictionary Of Keys based sparse matrix.
+- coo_matrix(arg1[, shape, dtype, copy])	            A sparse matrix in COOrdinate format.
+- dia_matrix(arg1[, shape, dtype, copy])	            Sparse matrix with DIAgonal storage
+
+Here are some examples to use them.
+
+
+Creating sparse matrices
+------------------------
+
+Sparse matrices comes from SciPy package for numerical data.
+First to use them it is necessary to import libraries.
+
+.. doctest::
+
+    >>> import numpy as np
+    >>> from lazyarray import larray
+    >>> from scipy.sparse import bsr_matrix, coo_matrix, csc_matrix, csr_matrix, dia_matrix, dok_matrix, lil_matrix
+
+Creating a sparse matrix requires filling each row and column with data. For example :
+
+.. doctest::
+
+    >>> row = np.array([0, 2, 2, 0, 1, 2])
+    >>> col = np.array([0, 0, 1, 2, 2, 2])
+    >>> data = np.array([1, 2, 3, 4, 5, 6])
+
+The 7 sparse matrices are not defined in the same way.
+
+The bsr_matrix, coo_matrix, csc_matrix and csr_matrix are defined as follows :
+
+.. doctest::
+
+    >>> sparr = bsr_matrix((data, (row, col)), shape=(3, 3))
+    >>> sparr = coo_matrix((data, (row, col)), shape=(3, 3))
+    >>> sparr = csc_matrix((data, (row, col)), shape=(3, 3))
+    >>> sparr = csr_matrix((data, (row, col)), shape=(3, 3))
+
+In regards to the dia_matrix :
+
+.. doctest::
+
+    >>> data_dia = np.array([[1, 2, 3, 4]]).repeat(3, axis=0)
+    >>> offsets = np.array([0, -1, 2])
+    >>> sparr = dia_matrix((data_dia, offsets), shape=(4, 4))
+
+For the dok_matrix :
+
+.. doctest::
+
+    >>> sparr = dok_matrix(((row, col)), shape=(3, 3))
+
+For the lil_matrix :
+
+.. doctest::
+
+    >>> sparr = lil_matrix(data, shape=(3, 3)) 
+
+In the continuation of this tutorial, the sparse matrix used will be called sparr and refers to the csc_matrix.
+
+It is possible to convert the sparse matrix as a NumPy array, as follows:
+
+.. doctest::
+
+    >>> print(sparr.toarray())
+    array([[1, 0, 4],
+           [0, 0, 5],
+           [2, 3, 6]])
+
+
+Specifying the shape and the type of a sparse matrix
+----------------------------------------------------
+
+To know the shape and the type of the sparse matrices, you can use :
+
+.. doctest::
+
+    >>> larr = larray(sparr)
+    >>> print (larr.shape)
+    (3, 3)
+    >>> print (larr.dtype)
+    dtype('int64')
+
+
+Evaluating a sparse matrix
+--------------------------
+
+Evaluating a sparse matrix refers to the evaluate() method, which returns a NumPy array :
+
+.. doctest::
+
+    >>> print (larr.evaluate())
+    array([[1, 0, 4],
+           [0, 0, 5],
+           [2, 3, 6]])
+
+When creating a sparse matrix, some values may remain empty.
+In this case, the evaluate () method has the argument, called empty_val, referring to the special value nan, for Not a Number, defined in NumPy.
+This method fills these empty with this nan value.
+
+.. doctest::
+
+    >>> print (larr.evaluate(empty_val=np.nan))
+    array([[1, nan, 4],
+           [nan, nan, 5],
+           [2, 3, 6]])
+
+
+Accessing individual rows or columns of a sparse matrix
+-------------------------------------------------------
+
+To access specific elements of the matrix, like individual rows or columns :
+
+.. doctest::
+
+    >>> larr[2, :]
+
+In this case, the third line of the sparse matrix is obtained.
+However, this method is different depending on the sparse matrices used :
+
+For csc_matrix and csr_matrix :
+
+.. doctest::
+
+    >>> print (larr[2, :])
+    array([2, 3, 6])
+
+During execution, the matrices bsr_matrix, coo_matrix and dia_matrix, do not support indexing.
+The solution is to convert them to another format.
+It is therefore necessary to go through csr_matrix in order to perform the calculation.
+
+.. doctest::
+
+    >>> print(sparr.tocsr()[2,:])
+
+Depending on the definition given previously to the matrix, for the dok_matrix :
+
+.. doctest::
+
+    >>> print (larr[1, :])
+
+And for lil_matrix :
+
+.. doctest::
+
+    >>> print (larr[0, :])
+
+In case we want to access an element of a column, we must proceed in the same way as previously, by changing index.
+Here is an example of how to access an item in the third column of the sparse matrix.
+
+.. doctest::
+
+    >>> larr[:, 2]
+
+Finally, to have information on the sparse matrix :
+
+.. doctest::
+
+    >>>print (larr.base_value)
+    <3x3 sparse matrix of type '<class 'numpy.int64'>'
+    	with 6 stored elements in Compressed Sparse Column format>
+
diff --git a/lazyarray.py b/lazyarray.py
@@ -13,6 +13,11 @@
 from functools import wraps
 import logging
 
+from scipy import sparse
+from scipy.sparse import bsr_matrix, coo_matrix, csc_matrix, csr_matrix, dia_matrix, dok_matrix, lil_matrix
+
+
+
 __version__ = "0.2.10"
 
 # stuff for Python 3 compatibility
@@ -136,6 +141,7 @@ def op(self):
     return op
 
 
+
 class larray(object):
     """
     Optimises storage of and operations on arrays in various ways:
@@ -150,6 +156,7 @@ class larray(object):
       - in parallelized code, different rows or columns may be evaluated
         on different nodes or in different threads.
     """
+
 
     def __init__(self, value, shape=None, dtype=None):
         """
@@ -163,6 +170,7 @@ def __init__(self, value, shape=None, dtype=None):
         and a 1D array when either `i` or `j` or both is a NumPy array (in the
         latter case the two arrays must have equal lengths).
         """
+
         self.dtype = dtype
         self.operations = []
         if isinstance(value, basestring):
@@ -174,15 +182,19 @@ def __init__(self, value, shape=None, dtype=None):
             self.base_value = value.base_value
             self.dtype = dtype or value.dtype
             self.operations = value.operations  # should deepcopy?
+
         elif isinstance(value, collections.Sized):  # False for numbers, generators, functions, iterators
-            if not isinstance(value, numpy.ndarray):
+            if sparse.issparse(value):  # For sparse matrices
+                self.dtype = dtype or value.dtype                           
+            elif not isinstance(value, numpy.ndarray):  
                 value = numpy.array(value, dtype=dtype)
             elif dtype is not None:
-                assert numpy.can_cast(value.dtype, dtype, casting='safe')  # or could convert value to the provided dtype
+               assert numpy.can_cast(value.dtype, dtype, casting='safe')  # or could convert value to the provided dtype
             if shape and value.shape != shape:
                 raise ValueError("Array has shape %s, value has shape %s" % (shape, value.shape))
             self._shape = value.shape
             self.base_value = value
+
         else:
             assert numpy.isreal(value)  # also True for callables, generators, iterators
             self._shape = shape
@@ -193,6 +205,7 @@ def __init__(self, value, shape=None, dtype=None):
                     self.base_value = dtype(value)
                 except TypeError:
                     self.base_value = value
+
 
     def __eq__(self, other):
         return self.base_value == other.base_value and self.operations == other.operations and self._shape == other.shape
@@ -339,6 +352,8 @@ def _partially_evaluate(self, addr, simplify=False):
             base_val = self._homogeneous_array(addr) * self.base_value
         elif isinstance(self.base_value, numpy.ndarray):
             base_val = self.base_value[addr]
+        elif sparse.issparse(self.base_value):  # For sparse matrices larr[2, :]
+            base_val = self.base_value[addr]
         elif callable(self.base_value):
             indices = self._array_indices(addr)
             base_val = self.base_value(*indices)
@@ -428,12 +443,13 @@ def _apply_operations(self, x, addr=None, simplify=False):
                     x = f(x, arg.evaluate(simplify=simplify))
                 else:
                     x = f(x, arg._partially_evaluate(addr, simplify=simplify))
+
             else:
                 x = f(x, arg)
         return x
 
     @requires_shape
-    def evaluate(self, simplify=False):
+    def evaluate(self, simplify=False, empty_val=0):
         """
         Return the lazy array as a real NumPy array.
 
@@ -456,6 +472,12 @@ def evaluate(self, simplify=False):
             x = self.base_value.next(self.size)
             if x.shape != self._shape:
                 x = x.reshape(self._shape)
+        elif sparse.issparse(self.base_value):  # For sparse matrices
+            if empty_val!=0:
+                x = self.base_value.toarray((sparse.csc_matrix))
+                x = numpy.where(x, x, numpy.nan)
+            else:
+                x = self.base_value.toarray((sparse.csc_matrix))
         elif isinstance(self.base_value, collections.Iterator):
             x = numpy.fromiter(self.base_value, dtype=self.dtype or float, count=self.size)
             if x.shape != self._shape:
@@ -533,3 +555,4 @@ class VectorizedIterable(object):
     obj = getattr(numpy, name)
     if isinstance(obj, numpy.ufunc):
         namespace[name] = _build_ufunc(obj)
+
diff --git a/test/test_lazyarray.py b/test/test_lazyarray.py
@@ -12,6 +12,8 @@
 from numpy.testing import assert_array_equal, assert_array_almost_equal
 import operator
 from copy import deepcopy
+from scipy.sparse import bsr_matrix, coo_matrix, csc_matrix, csr_matrix, dia_matrix, dok_matrix, lil_matrix
+
 
 
 class MockRNG(VectorizedIterable):
@@ -107,6 +109,86 @@ def test_create_with_larray():
     assert_array_equal(B.evaluate(),
                        numpy.array([102, 101, 100]))
 
+
+## For sparse matrices
+def test_create_with_sparse_array():
+    row = numpy.array([0, 2, 2, 0, 1, 2])
+    col = numpy.array([0, 0, 1, 2, 2, 2])
+    data = numpy.array([1, 2, 3, 4, 5, 6])
+    bsr = larray(bsr_matrix((data, (row, col)), shape=(3, 3))) # For bsr_matrix
+    coo = larray(coo_matrix((data, (row, col)), shape=(3, 3))) # For coo_matrix
+    csc = larray(csc_matrix((data, (row, col)), shape=(3, 3))) # For csc_matrix
+    csr = larray(csr_matrix((data, (row, col)), shape=(3, 3))) # For csr_matrix
+    data_dia = numpy.array([[1, 2, 3, 4]]).repeat(3, axis=0) # For dia_matrix
+    offsets_dia = numpy.array([0, -1, 2]) # For dia_matrix
+    dia = larray(dia_matrix((data_dia, offsets_dia), shape=(4, 4))) # For dia_matrix
+    dok = larray(dok_matrix(((row, col)), shape=(3, 3))) # For dok_matrix
+    lil = larray(lil_matrix(data, shape=(3, 3))) # For lil_matrix  
+    assert bsr.shape == (3, 3)
+    assert coo.shape == (3, 3)
+    assert csc.shape == (3, 3)
+    assert csr.shape == (3, 3)
+    assert dia.shape == (4, 4)
+    assert dok.shape == (2, 6)
+    assert lil.shape == (1, 6)
+
+    def test_evaluate_with_sparse_array():
+        assert_array_equal(bsr.evaluate(), bsr_matrix((data, (row, col))).toarray()) # For bsr_matrix
+        assert_array_equal(coo.evaluate(), coo_matrix((data, (row, col))).toarray()) # For coo_matrix
+        assert_array_equal(csc.evaluate(), csc_matrix((data, (row, col))).toarray()) # For csc_matrix
+        assert_array_equal(csr.evaluate(), csr_matrix((data, (row, col))).toarray()) # For csr_matrix
+        assert_array_equal(dia.evaluate(), dia_matrix((data_dia, (row, col))).toarray()) # For dia_matrix
+        assert_array_equal(dok.evaluate(), dok_matrix((data, (row, col))).toarray()) # For dok_matrix
+        assert_array_equal(lil.evaluate(), lil_matrix((data, (row, col))).toarray()) # For lil_matrix
+
+    def test_multiple_operations_with_sparse_array():
+        # For bsr_matrix
+        bsr0 = bsr /100.0
+        bsr1 = 0.2 + bsr0
+        assert_array_almost_equal(bsr0.evaluate(), np.array([[0.01, 0., 0.04], [0., 0., 0.05], [0.02, 0.03, 0.06]]))
+        assert_array_almost_equal(bsr0.evaluate(), np.array([[0.21, 0.2, 0.24], [0.2, 0.2, 0.25], [0.22, 0.23, 0.26]]))
+        # For coo_matrix
+        coo0 = coo /100.0
+        coo1 = 0.2 + coo0
+        assert_array_almost_equal(coo0.evaluate(), np.array([[0.01, 0., 0.04], [0., 0., 0.05], [0.02, 0.03, 0.06]]))
+        assert_array_almost_equal(coo0.evaluate(), np.array([[0.21, 0.2, 0.24], [0.2, 0.2, 0.25], [0.22, 0.23, 0.26]]))
+        # For csc_matrix
+        csc0 = csc /100.0
+        csc1 = 0.2 + csc0
+        assert_array_almost_equal(csc0.evaluate(), np.array([[0.01, 0., 0.04], [0., 0., 0.05], [0.02, 0.03, 0.06]]))
+        assert_array_almost_equal(csc0.evaluate(), np.array([[0.21, 0.2, 0.24], [0.2, 0.2, 0.25], [0.22, 0.23, 0.26]]))
+        # For csr_matrix
+        csr0 = csr /100.0
+        csr1 = 0.2 + csr0
+        assert_array_almost_equal(csc0.evaluate(), np.array([[0.01, 0., 0.04], [0., 0., 0.05], [0.02, 0.03, 0.06]]))
+        assert_array_almost_equal(csc0.evaluate(), np.array([[0.21, 0.2, 0.24], [0.2, 0.2, 0.25], [0.22, 0.23, 0.26]]))    
+        # For dia_matrix
+        dia0 = dia /100.0
+        dia1 = 0.2 + dia0
+        assert_array_almost_equal(dia0.evaluate(), np.array([[0.01, 0.02, 0.03, 0.04]]))
+        assert_array_almost_equal(dia1.evaluate(), np.array([[0.21, 0.22, 0.23, 0.24]]))
+         # For dok_matrix
+        dok0 = dok /100.0
+        dok1 = 0.2 + dok0
+        assert_array_almost_equal(dok0.evaluate(), np.array([[0., 0.02, 0.02, 0., 0.01, 0.02], [0., 0., 0.01, 0.02, 0.02, 0.02]]))
+        assert_array_almost_equal(dok1.evaluate(), np.array([[0.2, 0.22, 0.22, 0.2, 0.21, 0.22], [0.2, 0.2, 0.21, 0.22, 0.22, 0.22]]))
+         # For lil_matrix
+        lil0 = lil /100.0
+        lil1 = 0.2 + lil0
+        assert_array_almost_equal(lil0.evaluate(), np.array([[0.01, 0.02, 0.03, 0.04, 0.05, 0.06]]))
+        assert_array_almost_equal(lil1.evaluate(), np.array([[0.21, 0.22, 0.23, 0.24, 0.25, 0.26]]))
+
+
+    def test_getitem_from_2D_sparse_array():
+        assert_raises(IndexError, bsr.__getitem__, (3, 0))
+        assert_raises(IndexError, coo.__getitem__, (3, 0))
+        assert_raises(IndexError, csc.__getitem__, (3, 0))
+        assert_raises(IndexError, csr.__getitem__, (3, 0))
+        assert_raises(IndexError, dia.__getitem__, (3, 0))
+        assert_raises(IndexError, dok.__getitem__, (3, 0))
+        assert_raises(IndexError, lil.__getitem__, (3, 0))
+
+
 # def test_columnwise_iteration_with_flat_array():
 # m = larray(5, shape=(4,3)) # 4 rows, 3 columns
 #    cols = [col for col in m.by_column()]