Skip to content

Commit

Permalink
BUG: fix globbed csv to chunked dataframe (#481)
Browse files Browse the repository at this point in the history
* BUG: fix globbed csv to chunked dataframe
  • Loading branch information
llllllllll authored Sep 19, 2016
1 parent f3c2012 commit b3b1ed9
Show file tree
Hide file tree
Showing 2 changed files with 34 additions and 2 deletions.
20 changes: 18 additions & 2 deletions odo/backends/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

import pandas as pd

from dask.threaded import get as dsk_get
import datashape

from datashape import discover, Record, Option
Expand Down Expand Up @@ -366,8 +367,23 @@ def resource_glob(uri, **kwargs):
@convert.register(chunks(pd.DataFrame), (chunks(CSV), chunks(Temp(CSV))),
                  cost=10.0)
def convert_glob_of_csvs_to_chunks_of_dataframes(csvs, **kwargs):
    """Convert a chunked collection of CSVs into chunks of DataFrames.

    Each element of ``csvs`` is itself converted to ``chunks(pd.DataFrame)``
    and the per-file results are flattened into one stream of frames, in the
    same order as the input CSVs.

    Parameters
    ----------
    csvs : chunks(CSV) or chunks(Temp(CSV))
        The CSV resources to convert; iterated when the result is consumed.
    **kwargs
        Forwarded unchanged to every per-file ``convert`` call.

    Returns
    -------
    chunks(pd.DataFrame)
        A lazy chunked object wrapping a callable; no CSV is read until the
        chunks are iterated.
    """
    # NOTE: the pre-fix implementation wrapped a list of un-called ``partial``
    # objects in ``chunks(pd.DataFrame)``, so iterating yielded partials
    # instead of frames. Only the task-graph version below is kept.
    convert_one = partial(convert, chunks(pd.DataFrame), **kwargs)

    def df_gen():
        # Build up a dask graph to run all of the `convert` calls
        # concurrently. Each value is a dask task tuple: (callable, argument).
        dsk = {}
        # Use a list to hold the requested key names to ensure that we
        # return the results in the correct order.
        keys = []
        for n, csv_ in enumerate(csvs):
            key = 'p%d' % n
            dsk[key] = convert_one, csv_
            keys.append(key)

        # Every task yields a chunks(pd.DataFrame); `concat` flattens those
        # per-file sequences into a single iterable of frames.
        return concat(dsk_get(dsk, keys))

    return chunks(pd.DataFrame)(df_gen)


@convert.register(Temp(CSV), (pd.DataFrame, chunks(pd.DataFrame)))
Expand Down
16 changes: 16 additions & 0 deletions odo/backends/tests/test_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -462,3 +462,19 @@ def test_string_n_convert(string_dshape):
expected = pd.DataFrame(raw, columns=list('kn'))
expected['k'] = pd.to_datetime(expected.k)
tm.assert_frame_equal(result, expected)


def test_globbed_csv_to_chunks_of_dataframe():
    """A globbed CSV uri converts to one DataFrame chunk per matched file."""
    header = 'a,b,c\n'
    file_contents = {
        'a-1.csv': header + '1,2,3\n4,5,6\n',
        'a-2.csv': header + '7,8,9\n10,11,12\n',
    }

    # Materialize the chunks while the temporary files still exist.
    with filetexts(file_contents):
        frames = list(odo('a-*.csv', chunks(pd.DataFrame)))

    assert len(frames) == 2

    columns = 'a', 'b', 'c'
    expected_rows = (
        [[1, 2, 3], [4, 5, 6]],
        [[7, 8, 9], [10, 11, 12]],
    )
    # Frames must come back in the same order as the globbed files.
    for frame, rows in zip(frames, expected_rows):
        tm.assert_frame_equal(frame, pd.DataFrame(rows, columns=columns))

0 comments on commit b3b1ed9

Please sign in to comment.