Prep.py throws an AssertionError
Hi,
I just wanted to try the tutorial after I found this video on YouTube: https://www.youtube.com/watch?v=5Md_sSsN51k
I'm using Anaconda on Windows 10. I installed all the necessary packages, and when I tried to execute the first cell in the notebook, I got the following error:
---------------------------------------------------------------------------
AssertionError Traceback (most recent call last)
C:\Users\Name\Documents\Python Scripts\parallel-tutorial-master\parallel-tutorial-master\prep.py in <module>()
39
40 for symbol in stocks:
---> 41 write_stock(symbol)
42
43
C:\Users\Name\Documents\Python Scripts\parallel-tutorial-master\parallel-tutorial-master\prep.py in write_stock(symbol)
35 names = [str(ts.date()) for ts in df.divisions]
36 df.to_csv(os.path.join(here, 'data', 'minute', symbol, '*.csv'),
---> 37 name_function=names.__getitem__)
38 print("Finished CSV: %s" % symbol)
39
C:\Users\Name\Anaconda3\lib\site-packages\dask\dataframe\core.py in to_csv(self, filename, **kwargs)
957 """ See dd.to_csv docstring for more information """
958 from .io import to_csv
--> 959 return to_csv(self, filename, **kwargs)
960
961 def to_delayed(self):
C:\Users\Name\Anaconda3\lib\site-packages\dask\dataframe\io\csv.py in to_csv(df, filename, name_function, compression, compute, get, storage_options, **kwargs)
503
504 if compute:
--> 505 delayed(values).compute(get=get)
506 else:
507 return values
C:\Users\Name\Anaconda3\lib\site-packages\dask\base.py in compute(self, **kwargs)
97 Extra keywords to forward to the scheduler ``get`` function.
98 """
---> 99 (result,) = compute(self, traverse=False, **kwargs)
100 return result
101
C:\Users\Name\Anaconda3\lib\site-packages\dask\base.py in compute(*args, **kwargs)
204 dsk = collections_to_dsk(variables, optimize_graph, **kwargs)
205 keys = [var._keys() for var in variables]
--> 206 results = get(dsk, keys, **kwargs)
207
208 results_iter = iter(results)
C:\Users\Name\Anaconda3\lib\site-packages\dask\threaded.py in get(dsk, result, cache, num_workers, **kwargs)
73 results = get_async(pool.apply_async, len(pool._pool), dsk, result,
74 cache=cache, get_id=_thread_get_id,
---> 75 pack_exception=pack_exception, **kwargs)
76
77 # Cleanup pools associated to dead threads
C:\Users\Name\Anaconda3\lib\site-packages\dask\local.py in get_async(apply_async, num_workers, dsk, result, cache, get_id, rerun_exceptions_locally, pack_exception, raise_exception, callbacks, dumps, loads, **kwargs)
519 _execute_task(task, data) # Re-execute locally
520 else:
--> 521 raise_exception(exc, tb)
522 res, worker_id = loads(res_info)
523 state['cache'][key] = res
C:\Users\Name\Anaconda3\lib\site-packages\dask\compatibility.py in reraise(exc, tb)
58 if exc.__traceback__ is not tb:
59 raise exc.with_traceback(tb)
---> 60 raise exc
61
62 else:
C:\Users\Name\Anaconda3\lib\site-packages\dask\local.py in execute_task(key, task_info, dumps, loads, get_id, pack_exception)
288 try:
289 task, data = loads(task_info)
--> 290 result = _execute_task(task, data)
291 id = get_id()
292 result = dumps((result, id))
C:\Users\Name\Anaconda3\lib\site-packages\dask\local.py in _execute_task(arg, cache, dsk)
269 func, args = arg[0], arg[1:]
270 args2 = [_execute_task(a, cache) for a in args]
--> 271 return func(*args2)
272 elif not ishashable(arg):
273 return arg
C:\Users\Name\Anaconda3\lib\site-packages\dask\compatibility.py in apply(func, args, kwargs)
45 def apply(func, args, kwargs=None):
46 if kwargs:
---> 47 return func(*args, **kwargs)
48 else:
49 return func(*args)
C:\Users\Name\Anaconda3\lib\site-packages\dask\dataframe\io\demo.py in generate_day(date, open, high, low, close, volume, freq, random_state)
114 values += np.linspace(open - values[0], close - values[-1],
115 len(values)) # endpoints
--> 116 assert np.allclose(open, values[0])
117 assert np.allclose(close, values[-1])
118
AssertionError:
I also tried it under macOS Sierra with Miniconda in a separate environment, as well as under Ubuntu 17.04 with Miniconda. And now I am out of operating systems :smile:
This is because of https://github.com/pydata/pandas-datareader/issues/391.
We're working around it; I recommend checking back on Monday.
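For anyone curious about the mechanics: below is a minimal sketch (my reconstruction from the traceback above, not the actual dask source) of the endpoint correction that `generate_day` performs. The linear ramp pins `values[0]` to `open` exactly, so the assert can only fire when the `open`/`close` fetched upstream are already bad (e.g. NaN), which is the kind of data the broken Google endpoint in pandas-datareader#391 produces:

```python
import numpy as np

# Sketch of the endpoint correction seen in the traceback above.
rng = np.random.RandomState(0)
open_, close = 100.0, 101.5                     # well-formed inputs
values = open_ + rng.randn(390).cumsum()        # fake intraday price walk
values += np.linspace(open_ - values[0], close - values[-1], len(values))
assert np.allclose(open_, values[0])            # passes: endpoint is pinned
assert np.allclose(close, values[-1])           # passes

# With bad upstream data (e.g. a NaN open), the same check fails, since
# np.allclose treats NaN as unequal to everything, including itself:
print(np.allclose(np.nan, np.nan))              # False -> AssertionError
```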
As per the instructor, it seems this has been fixed in the repo, but if you've already built the environment, you can update just the relevant dependency with `conda install dask -c conda-forge`.
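A quick way to confirm the updated dask actually landed in the environment your notebook is using:

```python
# Run inside the tutorial notebook/environment; just reports the version.
import dask
print(dask.__version__)
```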
I did a fresh `git clone ...` and ran `python prep.py`.
This time it worked until:
C:\Users\Name\Documents\Python Scripts\parallel-tutorial>python prep.py
Finished CSV: afl
Finished CSV: aig
Finished CSV: al
Finished CSV: avy
Finished CSV: bwa
Traceback (most recent call last):
File "prep.py", line 44, in <module>
write_stock(symbol)
File "prep.py", line 37, in write_stock
data_source='google')
File "C:\Users\Name\Anaconda3\lib\site-packages\dask\dataframe\io\demo.py", line 187, in daily_stock
df = data.DataReader(symbol, data_source, start, stop)
File "C:\Users\Name\Anaconda3\lib\site-packages\pandas_datareader\data.py", line 137, in DataReader
session=session).read()
File "C:\Users\Name\Anaconda3\lib\site-packages\pandas_datareader\base.py", line 181, in read
params=self._get_params(self.symbols))
File "C:\Users\Name\Anaconda3\lib\site-packages\pandas_datareader\base.py", line 79, in _read_one_data
out = self._read_url_as_StringIO(url, params=params)
File "C:\Users\Name\Anaconda3\lib\site-packages\pandas_datareader\base.py", line 90, in _read_url_as_StringIO
response = self._get_response(url, params=params)
File "C:\Users\Name\Anaconda3\lib\site-packages\pandas_datareader\base.py", line 139, in _get_response
raise RemoteDataError('Unable to read URL: {0}'.format(url))
pandas_datareader._utils.RemoteDataError: Unable to read URL: http://www.google.com/finance/historical?q=hal&startdate=Sep+26%2C+2016&enddate=Sep+26%2C+2017&output=csv
But I could easily fix it myself: apparently Google has no data for those symbols in the specified date range.
I removed hal and ibm from the list, and then everything worked.
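For reference, a more forgiving variant of the loop at the bottom of prep.py could skip unavailable symbols instead of requiring hand-edits. This is only a sketch: `stocks` and `write_stock` are the names from prep.py visible in the tracebacks above, and the symbol list here is just the subset mentioned in this thread:

```python
# Sketch of a defensive download loop for prep.py. RemoteDataError is the
# exception raised in the second traceback when Google returns no data.
from pandas_datareader._utils import RemoteDataError

stocks = ['afl', 'aig', 'al', 'avy', 'bwa', 'hal', 'ibm']  # subset from this thread

for symbol in stocks:
    try:
        write_stock(symbol)  # helper defined earlier in prep.py
    except RemoteDataError as exc:
        print("Skipping %s: %s" % (symbol, exc))
```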
Surprised this is still open, but Google changed their API for historical stock data, so this is essentially unfixable now without migrating to a different data source. See https://github.com/pydata/parallel-tutorial/issues/30 and the discussion at the end of https://github.com/pydata/pandas-datareader/issues/391.
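If anyone wants to experiment with such a migration, pandas-datareader ships several other backends; here is a hypothetical sketch using the 'stooq' source (whether it works depends on your pandas-datareader version, and prep.py would still need adapting):

```python
# Hypothetical example: fetch daily OHLCV for one symbol from Stooq
# instead of the retired Google endpoint.
import datetime
from pandas_datareader import data

start = datetime.datetime(2016, 9, 26)
end = datetime.datetime(2017, 9, 26)
df = data.DataReader('IBM', 'stooq', start, end)
print(df.head())
```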