Ocean-Data-Map-Project
Ocean-Data-Map-Project copied to clipboard
`data_array_to_geojson` performance profile
trafficstars
I was curious to see how this function performs on the GIOPS 10-day forecast (lat/lon) dataset.
How I instrumented the file
pip install line_profiler
Then apply the following patch:
diff --git a/data/transformers/geojson.py b/data/transformers/geojson.py
index adef742..8d4c415 100644
--- a/data/transformers/geojson.py
+++ b/data/transformers/geojson.py
@@ -1,3 +1,4 @@
+from line_profiler import LineProfiler
import numpy as np
import xarray as xr
@@ -5,7 +6,19 @@ from data.utils import trunc
from geojson import Feature, FeatureCollection, Point
+profiler = LineProfiler()
+def profile(func):
+ def inner(*args, **kwargs):
+ profiler.add_function(func)
+ profiler.enable_by_count()
+ return func(*args, **kwargs)
+ return inner
+
+def print_stats():
+ profiler.print_stats()
+
+@profile
def data_array_to_geojson(data_array: xr.DataArray, bearings: xr.DataArray, lat_var: xr.DataArray, lon_var: xr.DataArray) -> FeatureCollection:
"""
Converts a given xarray.DataArray, along with lat and lon keys to a geojson.FeatureCollection (subclass of dict).
@@ -81,4 +94,5 @@ def data_array_to_geojson(data_array: xr.DataArray, bearings: xr.DataArray, lat_
features.append(Feature(geometry=p, properties=props))
+ print_stats()
return FeatureCollection(features)
Timer unit: 1e-06 s
Total time: 14.2641 s
File: /home/ubuntu/onav-cloud/Ocean-Data-Map-Project/data/transformers/geojson.py
Function: data_array_to_geojson at line 21
Line # Hits Time Per Hit % Time Line Contents
==============================================================
21 @profile
22 def data_array_to_geojson(data_array: xr.DataArray, bearings: xr.DataArray, lat_var: xr.DataArray, lon_var: xr.DataArray) -> FeatureCollection:
23 """
24 Converts a given xarray.DataArray, along with lat and lon keys to a geojson.FeatureCollection (subclass of dict).
25
26 A FeatureCollection is really just a list of geojson.Feature classes.
27 Each Feature contains a geojson.Point, and a `properties` dict which holds arbitrary attributes
28 of interest for a Point. In the case of this function, each Feature has the following properties:
29
30 * The data value corresponding to a lat/lon pair (e.g. salinity or temperature)
31 * A copy of the `attribs` field held in the data_array (yes there's lots of duplication...blame the geojson spec).
32
33 Important notes:
34
35 * All data values are truncated to 3 decimal places.
36 * NaN values are not added to the returned FeatureCollection...they are skipped.
37
38 Parameters:
39 * data_array -- A 2D field (i.e. lat/lon only...time and depth dims should be sliced out).
40 * lat_key -- Key of the latitude coordinate (e.g. "latitude").
41 * lon_key -- Key of the longitude coordinate (e.g. "longitude").
42
43 Returns:
44 FeatureCollection -- the subclassed `dict` with transformed collection of geojson features.
45 """
46
47 1 23.0 23.0 0.0 if data_array.ndim != 2:
48 raise ValueError(f"Data is not a 2D field: {data_array.shape}")
49
50 # Need to ensure that data values are 64-bit floats (i.e. Python builtin float) because
51 # that's the only type of float that json will serialize without a custom serializer.
52 # Floats from netCDF4 datasets are often 32-bit.
53 1 644957.0 644957.0 4.5 data = trunc(data_array).astype(float).values
54
55 1 2.0 2.0 0.0 if bearings is not None:
56 1 650795.0 650795.0 4.6 bearings = trunc(bearings).astype(float).values
57
58 1 10.0 10.0 0.0 units_key = next((s for s in data_array.attrs.keys() if 'unit' in s), None)
59
60 1 1.0 1.0 0.0 name_key = 'long_name'
61 1 3.0 3.0 0.0 if 'long_name' not in data_array.attrs.keys():
62 1 5.0 5.0 0.0 name_key = next((s for s in data_array.attrs.keys() if 'name' in s), None)
63
64 1 1.0 1.0 0.0 attribs = {
65 1 3.0 3.0 0.0 'units': data_array.attrs[units_key],
66 1 3.0 3.0 0.0 'name': data_array.attrs[name_key],
67 }
68
69 1 2.0 2.0 0.0 def enumerate_nd_array(array: np.ndarray):
70 it = np.nditer(array, flags=['multi_index'], op_flags=['readonly'])
71 while not it.finished:
72 yield it[0], it.multi_index
73 it.iternext()
74
75 1 2.0 2.0 0.0 features = []
76 95851 354029.0 3.7 2.5 for elem, multi_idx in enumerate_nd_array(data):
77 95850 357352.0 3.7 2.5 if np.isnan(elem):
78 29320 49745.0 1.7 0.3 continue
79
80 133060 1197248.0 9.0 8.4 p = Point(
81 66530 129777.0 2.0 0.9 (
82 66530 4910270.0 73.8 34.4 ((lon_var[multi_idx[1]].item() + 180.0) % 360.0) - 180.0,
83 66530 4432117.0 66.6 31.1 lat_var[multi_idx[0]].item()
84 )
85 )
86
87 66530 158731.0 2.4 1.1 props = {
88 66530 129429.0 1.9 0.9 **attribs,
89 66530 130693.0 2.0 0.9 'data': elem.item()
90 }
91
92 66530 120442.0 1.8 0.8 if bearings is not None:
93 66530 229543.0 3.5 1.6 props['bearing'] = bearings[multi_idx].item()
94
95 66530 768963.0 11.6 5.4 features.append(Feature(geometry=p, properties=props))
96
97 print_stats()
98 return FeatureCollection(features)
I find the following lines suspicious:
Truncating and type casting take a while (roughly 0.65 s per call, about 4.5% of the total time each):
- 53
- 56
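Accessing the lat/lon values seems very slow: these two lines together account for about 65% of the total time (34.4% + 31.1%). Each loop iteration indexes the `xr.DataArray` objects `lon_var` and `lat_var` one element at a time and then calls `.item()`; this is most likely slow because of poor CPU cache utilization: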
- 82
- 83