"Too many open files" error
The code:
import datetime as dt

import numpy as np
import xarray as xr
from xarray.backends import NetCDF4DataStore
from scipy.ndimage import generic_filter
from siphon.catalog import TDSCatalog
from metpy.units import units


def obtain_narr_access():
    # Set up NCSS subset access to the full NARR collection on the THREDDS server
    url = 'http://atlas.niu.edu:8080/thredds/catalog/grib/NARR/catalog.xml'
    cat = TDSCatalog(url)
    cat_ds = cat.datasets['Full Collection Dataset']
    subset_access = cat_ds.subset()
    return subset_access


def obtain_dataset(access, name, level, box, wanted_dt):
    # Request one variable at one vertical level, time, and lon/lat box as netCDF
    query = access.query()
    query.variables(name)
    query.vertical_level(level)
    query.lonlat_box(*box)
    query.time(wanted_dt)
    query.accept('netcdf4')
    nc = access.get_data(query)
    ds = xr.open_dataset(NetCDF4DataStore(nc))
    dat = ds.metpy.parse_cf(name)
    return dat


def get_time_range(center, r):
    # Daily datetimes from center - r days to center + r days, inclusive
    times = (center + dt.timedelta(i) for i in range(-r, r + 1))
    return times


def get_grid_at_time(data, access, t):
    # Return the 2-m temperature grid at time t, caching grids in the data dict
    name = 'Temperature_height_above_ground'
    box = (-125, -75, 20, 55)
    try:
        res = data[t].copy()
    except KeyError:
        ds = obtain_dataset(access, name, 2., box, t)
        data[t] = ds[0,0,:]
        res = data[t].copy()
    return res, data


def compute_mean(access, data, center, n):
    # n-day mean of the 2-m temperature grid centered on the given date
    name = 'Temperature_height_above_ground'
    plusminus = n // 2
    times = list(get_time_range(center, plusminus))
    running_tot, data = get_grid_at_time(data, access, times[0])
    for t in times[1:]:
        grid, data = get_grid_at_time(data, access, t)
        running_tot += grid
    res = running_tot / n
    return res, data


narr_access = obtain_narr_access()
first_date = dt.datetime(1979, 1, 3, 0)
last_date = dt.datetime(2018, 12, 29, 0)
temp_store = {}
s = 1000 // 32
biggest_inc = 0
i = 0
d = first_date
while d < last_date:
    # Compare the 5-day mean centered on d with the one centered 5 days later
    mean1, temp_store = compute_mean(narr_access, temp_store, d, 5)
    mean2, temp_store = compute_mean(narr_access, temp_store, d + dt.timedelta(5), 5)
    change = mean2 - mean1
    smooth_change = generic_filter(change, np.mean, size=s, mode='constant')
    max_inc = np.nanmax(smooth_change)
    if max_inc > biggest_inc:
        print('New warmup:', d, max_inc)
        biggest_inc = max_inc
        biggest_info = (mean1, mean2, smooth_change, d)
    d += dt.timedelta(1)
    i += 1
    if i % 50 == 0:
        print(d)
    # Workaround
    #temp_store = {}
The output:
New warmup: 1979-01-03 00:00:00 5.348158
New warmup: 1979-01-06 00:00:00 5.718757
New warmup: 1979-01-07 00:00:00 6.124433
New warmup: 1979-01-13 00:00:00 10.476915
New warmup: 1979-01-14 00:00:00 14.078203
New warmup: 1979-01-19 00:00:00 14.185343
New warmup: 1979-02-16 00:00:00 16.56853
New warmup: 1979-02-17 00:00:00 18.901905
1979-02-22 00:00:00
1979-04-13 00:00:00
1979-06-02 00:00:00
1979-07-22 00:00:00
1979-09-10 00:00:00
1979-10-30 00:00:00
1979-12-19 00:00:00
1980-02-07 00:00:00
1980-03-28 00:00:00
1980-05-17 00:00:00
1980-07-06 00:00:00
1980-08-25 00:00:00
1980-10-14 00:00:00
1980-12-03 00:00:00
1981-01-22 00:00:00
New warmup: 1981-02-11 00:00:00 19.171156
New warmup: 1981-02-12 00:00:00 20.664352
1981-03-13 00:00:00
1981-05-02 00:00:00
1981-06-21 00:00:00
1981-08-10 00:00:00
1981-09-29 00:00:00
Traceback (most recent call last):
File "bug.py", line 39, in get_grid_at_time
res = data[t].copy()
KeyError: datetime.datetime(1981, 10, 14, 0, 0)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "bug.py", line 73, in <module>
mean2, temp_store = compute_mean(narr_access, temp_store, d+dt.timedelta(5), 5)
File "bug.py", line 54, in compute_mean
grid, data = get_grid_at_time(data, access, t)
File "bug.py", line 41, in get_grid_at_time
ds = obtain_dataset(access, name, 2., box, t)
File "bug.py", line 26, in obtain_dataset
nc = access.get_data(query)
File "/home/decker/local/anaconda3/envs/unidata/lib/python3.6/site-packages/siphon/ncss.py", line 115, in get_data
return response_handlers(resp, self.unit_handler)
File "/home/decker/local/anaconda3/envs/unidata/lib/python3.6/site-packages/siphon/ncss.py", line 294, in __call__
return self._reg.get(mimetype, self.default)(resp.content, unit_handler)
File "/home/decker/local/anaconda3/envs/unidata/lib/python3.6/site-packages/siphon/ncss.py", line 380, in read_netcdf
return Dataset(tmp_file.name, 'r')
File "netCDF4/_netCDF4.pyx", line 2123, in netCDF4._netCDF4.Dataset.__init__
File "netCDF4/_netCDF4.pyx", line 1743, in netCDF4._netCDF4._ensure_nc_success
OSError: [Errno 24] Too many open files: b'/tmp/tmp8g_2tbc9'
Clearly I need to close these temporary files, but searching the Siphon documentation for "close" gives no results. I can eliminate the error by occasionally clearing out my data cache (the temp_store dictionary; last line of program), but should that be necessary?
You could increase the OS's open-file limit, but I think ds.close() would close out the xarray dataset: http://xarray.pydata.org/en/stable/generated/xarray.Dataset.close.html
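Something along these lines, roughly (untested; it reuses access, query, and name from your obtain_dataset, and the resource calls are Unix-only):

import resource

# Raise the soft limit on open files up to the hard limit
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))

# Or: pull the values you need out of the dataset, then close it so the
# handle on the temporary netCDF file is released right away
nc = access.get_data(query)
ds = xr.open_dataset(NetCDF4DataStore(nc))
dat = ds.metpy.parse_cf(name)
grid = dat[0, 0, :].values.copy()
ds.close()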
Siphon isn't really playing a role here in terms of needing an explicit close. What's going on is that you're keeping a reference to every xarray dataset you ever open, via this line (in get_grid_at_time):
data[t] = ds[0,0,:]
While that is a slicing operation, the result still keeps a reference to the underlying file, so the file never gets closed. You may have more luck with:
data[t] = ds[0, 0, :].data.copy()
which should get the underlying numpy array and make a copy, ensuring that the underlying file is no longer referenced by the cache.
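For example, your get_grid_at_time could become something like this (just a sketch using the names from your code; note the cache would then hold plain numpy arrays rather than DataArrays):

def get_grid_at_time(data, access, t):
    name = 'Temperature_height_above_ground'
    box = (-125, -75, 20, 55)
    try:
        res = data[t].copy()
    except KeyError:
        dat = obtain_dataset(access, name, 2., box, t)
        # Cache a copy of the underlying array rather than a slice of the
        # DataArray, so the cache holds no reference to the file behind it
        data[t] = dat[0, 0, :].data.copy()
        res = data[t].copy()
    return res, data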
Thanks for the pointer. That copy operation appears to have done the trick! Realizing this is more of an xarray issue, I now see that perhaps xarray 0.11 would fix this as well (I'm at 0.10.8), but I may be misinterpreting the description of the change here: http://xarray.pydata.org/en/stable/whats-new.html?highlight=autoclose
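If I'm reading those notes right, 0.11 keeps backend files in a least-recently-used cache whose size can be tuned, something like the following (my assumption from the release notes, and it may not apply to datasets opened through an explicit NetCDF4DataStore as in my code):

import xarray as xr

# Available in xarray >= 0.11 (as I understand it); caps how many backend
# files xarray keeps open at once
xr.set_options(file_cache_maxsize=64)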