"Too many open files" error
The code:
import datetime as dt

import numpy as np
import xarray as xr
from xarray.backends import NetCDF4DataStore
from scipy.ndimage import generic_filter
from siphon.catalog import TDSCatalog
from metpy.units import units


def obtain_narr_access():
    # Set up NCSS subset access to the full NARR collection on the THREDDS server
    url = 'http://atlas.niu.edu:8080/thredds/catalog/grib/NARR/catalog.xml'
    cat = TDSCatalog(url)
    cat_ds = cat.datasets['Full Collection Dataset']
    subset_access = cat_ds.subset()
    return subset_access


def obtain_dataset(access, name, level, box, wanted_dt):
    # Request one variable at one vertical level, time, and lon/lat box as netCDF
    query = access.query()
    query.variables(name)
    query.vertical_level(level)
    query.lonlat_box(*box)
    query.time(wanted_dt)
    query.accept('netcdf4')
    nc = access.get_data(query)
    ds = xr.open_dataset(NetCDF4DataStore(nc))
    dat = ds.metpy.parse_cf(name)
    return dat


def get_time_range(center, r):
    # Daily datetimes from center - r days to center + r days, inclusive
    times = (center + dt.timedelta(i) for i in range(-r, r + 1))
    return times


def get_grid_at_time(data, access, t):
    # Return the 2-m temperature grid at time t, caching grids in the data dict
    name = 'Temperature_height_above_ground'
    box = (-125, -75, 20, 55)
    try:
        res = data[t].copy()
    except KeyError:
        ds = obtain_dataset(access, name, 2., box, t)
        data[t] = ds[0,0,:]
        res = data[t].copy()
    return res, data


def compute_mean(access, data, center, n):
    # n-day mean of the 2-m temperature grid centered on the given date
    name = 'Temperature_height_above_ground'
    plusminus = n // 2
    times = list(get_time_range(center, plusminus))
    running_tot, data = get_grid_at_time(data, access, times[0])
    for t in times[1:]:
        grid, data = get_grid_at_time(data, access, t)
        running_tot += grid
    res = running_tot / n
    return res, data


narr_access = obtain_narr_access()
first_date = dt.datetime(1979, 1, 3, 0)
last_date = dt.datetime(2018, 12, 29, 0)
temp_store = {}
s = 1000 // 32
biggest_inc = 0
i = 0
d = first_date
while d < last_date:
    # Compare the 5-day mean centered on d with the one centered 5 days later
    mean1, temp_store = compute_mean(narr_access, temp_store, d, 5)
    mean2, temp_store = compute_mean(narr_access, temp_store, d + dt.timedelta(5), 5)
    change = mean2 - mean1
    smooth_change = generic_filter(change, np.mean, size=s, mode='constant')
    max_inc = np.nanmax(smooth_change)
    if max_inc > biggest_inc:
        print('New warmup:', d, max_inc)
        biggest_inc = max_inc
        biggest_info = (mean1, mean2, smooth_change, d)
    d += dt.timedelta(1)
    i += 1
    if i % 50 == 0:
        print(d)
    # Workaround
    #temp_store = {}
The output:
New warmup: 1979-01-03 00:00:00 5.348158
New warmup: 1979-01-06 00:00:00 5.718757
New warmup: 1979-01-07 00:00:00 6.124433
New warmup: 1979-01-13 00:00:00 10.476915
New warmup: 1979-01-14 00:00:00 14.078203
New warmup: 1979-01-19 00:00:00 14.185343
New warmup: 1979-02-16 00:00:00 16.56853
New warmup: 1979-02-17 00:00:00 18.901905
1979-02-22 00:00:00
1979-04-13 00:00:00
1979-06-02 00:00:00
1979-07-22 00:00:00
1979-09-10 00:00:00
1979-10-30 00:00:00
1979-12-19 00:00:00
1980-02-07 00:00:00
1980-03-28 00:00:00
1980-05-17 00:00:00
1980-07-06 00:00:00
1980-08-25 00:00:00
1980-10-14 00:00:00
1980-12-03 00:00:00
1981-01-22 00:00:00
New warmup: 1981-02-11 00:00:00 19.171156
New warmup: 1981-02-12 00:00:00 20.664352
1981-03-13 00:00:00
1981-05-02 00:00:00
1981-06-21 00:00:00
1981-08-10 00:00:00
1981-09-29 00:00:00
Traceback (most recent call last):
File "bug.py", line 39, in get_grid_at_time
res = data[t].copy()
KeyError: datetime.datetime(1981, 10, 14, 0, 0)
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "bug.py", line 73, in <module>
mean2, temp_store = compute_mean(narr_access, temp_store, d+dt.timedelta(5), 5)
File "bug.py", line 54, in compute_mean
grid, data = get_grid_at_time(data, access, t)
File "bug.py", line 41, in get_grid_at_time
ds = obtain_dataset(access, name, 2., box, t)
File "bug.py", line 26, in obtain_dataset
nc = access.get_data(query)
File "/home/decker/local/anaconda3/envs/unidata/lib/python3.6/site-packages/siphon/ncss.py", line 115, in get_data
return response_handlers(resp, self.unit_handler)
File "/home/decker/local/anaconda3/envs/unidata/lib/python3.6/site-packages/siphon/ncss.py", line 294, in __call__
return self._reg.get(mimetype, self.default)(resp.content, unit_handler)
File "/home/decker/local/anaconda3/envs/unidata/lib/python3.6/site-packages/siphon/ncss.py", line 380, in read_netcdf
return Dataset(tmp_file.name, 'r')
File "netCDF4/_netCDF4.pyx", line 2123, in netCDF4._netCDF4.Dataset.__init__
File "netCDF4/_netCDF4.pyx", line 1743, in netCDF4._netCDF4._ensure_nc_success
OSError: [Errno 24] Too many open files: b'/tmp/tmp8g_2tbc9'
Clearly I need to close these temporary files, but searching the Siphon documentation for "close" gives no results. I can eliminate the error by occasionally clearing out my data cache (the temp_store dictionary; last line of program), but should that be necessary?
You could increase the OS's open-file limit, but I think ds.close() would close out the xarray dataset: http://xarray.pydata.org/en/stable/generated/xarray.Dataset.close.html
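Something along these lines, roughly (untested; it reuses access, query, and name from your obtain_dataset, and the resource calls are Unix-only):

import resource

# Raise the soft limit on open files up to the hard limit
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))

# Or: pull the values you need out of the dataset, then close it so the
# handle on the temporary netCDF file is released right away
nc = access.get_data(query)
ds = xr.open_dataset(NetCDF4DataStore(nc))
dat = ds.metpy.parse_cf(name)
grid = dat[0, 0, :].values.copy()
ds.close()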
Siphon isn't really playing a role here in terms of needing an explicit close. What's going on is that you're keeping a reference to every xarray dataset you ever open, via this line (in get_grid_at_time):
data[t] = ds[0,0,:]
While that is a slicing operation, the result still keeps a reference to the underlying file, so the file never gets closed. You may have more luck with:
data[t] = ds[0, 0, :].data.copy()
which should get the underlying numpy array and make a copy, ensuring that the underlying file is no longer referenced by the cache.
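For example, your get_grid_at_time could become something like this (just a sketch using the names from your code; note the cache would then hold plain numpy arrays rather than DataArrays):

def get_grid_at_time(data, access, t):
    name = 'Temperature_height_above_ground'
    box = (-125, -75, 20, 55)
    try:
        res = data[t].copy()
    except KeyError:
        dat = obtain_dataset(access, name, 2., box, t)
        # Cache a copy of the underlying array rather than a slice of the
        # DataArray, so the cache holds no reference to the file behind it
        data[t] = dat[0, 0, :].data.copy()
        res = data[t].copy()
    return res, data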
Thanks for the pointer. That copy operation appears to have done the trick! Realizing this is more of an xarray issue, I now see that perhaps xarray 0.11 would fix this as well (I'm at 0.10.8), but I may be misinterpreting the description of the change here: http://xarray.pydata.org/en/stable/whats-new.html?highlight=autoclose
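If I'm reading those notes right, 0.11 keeps backend files in a least-recently-used cache whose size can be tuned, something like the following (my assumption from the release notes, and it may not apply to datasets opened through an explicit NetCDF4DataStore as in my code):

import xarray as xr

# Available in xarray >= 0.11 (as I understand it); caps how many backend
# files xarray keeps open at once
xr.set_options(file_cache_maxsize=64)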