luminaire
luminaire copied to clipboard
🐛 sklearn/numpy 'array has an inhomogeneous shape after 1 dimensions'
Running more or less the vanilla GitHub example code — apart from the definition of window lengths — for batch/streaming.
Batch mode runs through; however, streaming throws an error from sklearn/numpy functions when scoring test values. Searching Stack Overflow for the error reveals a known and identifiable cause: I think the problem is rooted in the way the data array is passed to sklearn/numpy.
Reproducible example at google colab.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
[<ipython-input-10-50dafe264e90>](https://localhost:8080/#) in <module>
1 scoring_data = data_test.copy()
----> 2 score, scored_window = model.score(scoring_data) # scoring_data is data over a time-window instead of a datapoint
8 frames
[/usr/local/lib/python3.8/dist-packages/luminaire/model/window_density.py](https://localhost:8080/#) in score(self, data, **kwargs)
710 agg_data = self._params['AggregatedData'][opt_timestamp]
711
--> 712 is_anomaly, prob_of_anomaly, attributes = self._call_scoring(df=data,
713 target_metric=target_metric,
714 anomaly_scores_gamma_alpha=anomaly_scores_gamma_alpha,
[/usr/local/lib/python3.8/dist-packages/luminaire/model/window_density.py](https://localhost:8080/#) in _call_scoring(self, df, target_metric, anomaly_scores_gamma_alpha, anomaly_scores_gamma_loc, anomaly_scores_gamma_beta, baseline, detrend_order, detrend_method, agg_data_model, detection_method, attributes, agg_data)
487 """
488
--> 489 is_anomaly, prob_of_anomaly = self._anomalous_region_detection(input_df=df, value_column=target_metric,
490 called_for="scoring",
491 anomaly_scores_gamma_alpha=anomaly_scores_gamma_alpha,
[/usr/local/lib/python3.8/dist-packages/luminaire/model/window_density.py](https://localhost:8080/#) in _anomalous_region_detection(self, input_df, window_length, value_column, called_for, anomaly_scores_gamma_alpha, anomaly_scores_gamma_loc, anomaly_scores_gamma_beta, detrend_order, baseline, detrend_method, agg_data_model, past_model, detection_method, agg_data)
776 elif called_for == "scoring":
777
--> 778 return self._get_result(input_df=input_df,
779 detrend_order=detrend_order,
780 agg_data_model=agg_data_model,
[/usr/local/lib/python3.8/dist-packages/luminaire/model/window_density.py](https://localhost:8080/#) in _get_result(self, input_df, detrend_order, agg_data_model, value_column, detrend_method, baseline_type, detection_method, baseline, anomaly_scores_gamma_alpha, anomaly_scores_gamma_loc, anomaly_scores_gamma_beta, agg_data)
620 baseline_execution_data.append(current_adjusted_data)
621 pca = PCA()
--> 622 scores = pca.fit_transform(StandardScaler().fit_transform(baseline_execution_data))
623 robust_cov = MinCovDet().fit(scores[:, :3])
624 mahalanobis_distance = robust_cov.mahalanobis(scores[:, :3]) # getting the top 3 dimensions
[/usr/local/lib/python3.8/dist-packages/sklearn/base.py](https://localhost:8080/#) in fit_transform(self, X, y, **fit_params)
850 if y is None:
851 # fit method of arity 1 (unsupervised transformation)
--> 852 return self.fit(X, **fit_params).transform(X)
853 else:
854 # fit method of arity 2 (supervised transformation)
[/usr/local/lib/python3.8/dist-packages/sklearn/preprocessing/_data.py](https://localhost:8080/#) in fit(self, X, y, sample_weight)
804 # Reset internal state before fitting
805 self._reset()
--> 806 return self.partial_fit(X, y, sample_weight)
807
808 def partial_fit(self, X, y=None, sample_weight=None):
[/usr/local/lib/python3.8/dist-packages/sklearn/preprocessing/_data.py](https://localhost:8080/#) in partial_fit(self, X, y, sample_weight)
839 """
840 first_call = not hasattr(self, "n_samples_seen_")
--> 841 X = self._validate_data(
842 X,
843 accept_sparse=("csr", "csc"),
[/usr/local/lib/python3.8/dist-packages/sklearn/base.py](https://localhost:8080/#) in _validate_data(self, X, y, reset, validate_separately, **check_params)
564 raise ValueError("Validation should be done on X, y or both.")
565 elif not no_val_X and no_val_y:
--> 566 X = check_array(X, **check_params)
567 out = X
568 elif no_val_X and not no_val_y:
[/usr/local/lib/python3.8/dist-packages/sklearn/utils/validation.py](https://localhost:8080/#) in check_array(array, accept_sparse, accept_large_sparse, dtype, order, copy, force_all_finite, ensure_2d, allow_nd, ensure_min_samples, ensure_min_features, estimator)
744 array = array.astype(dtype, casting="unsafe", copy=False)
745 else:
--> 746 array = np.asarray(array, order=order, dtype=dtype)
747 except ComplexWarning as complex_warning:
748 raise ValueError(
ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (52,) + inhomogeneous part.
Hi @sebapehl ,
The issue is the size of the scoring window. The Window Density model tracks data over a window and expects the scoring window size to be the same as the training window size (reference). In your notebook, you have set window_size = 7, and therefore your test_size needs to be 7 as well. I just gave it a try in the colab notebook and it worked fine.
Kindly let me know if you come up with any further issues!