FEDOT icon indicating copy to clipboard operation
FEDOT copied to clipboard

DataMerger fails when merging indices with different types

Open gkirgizov opened this issue 2 years ago • 1 comment

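Failing example below. The traceback shows that a str index can't be merged with an int index: `find_common_elements` calls `np.intersect1d`, which concatenates the two index arrays and sorts the result, and sorting mixed `str` and `int` values raises `TypeError: '<' not supported between instances of 'int' and 'str'`. Possibly this would also fail for two non-numeric indices.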

2022-07-28T15:17:14.1922292Z test/unit/utilities/test_project_import_export.py:88: 
2022-07-28T15:17:14.1922714Z _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
2022-07-28T15:17:14.1923082Z fedot/api/main.py:164: in fit
2022-07-28T15:17:14.1923533Z     self._train_pipeline_on_full_dataset(recommendations, full_train_not_preprocessed)
2022-07-28T15:17:14.1924029Z fedot/api/main.py:422: in _train_pipeline_on_full_dataset
2022-07-28T15:17:14.1924537Z     n_jobs=self.params.api_params['n_jobs'],
2022-07-28T15:17:14.1924958Z fedot/core/pipelines/pipeline.py:161: in fit
2022-07-28T15:17:14.1925366Z     use_fitted_operations=use_fitted)
2022-07-28T15:17:14.1925818Z fedot/core/pipelines/pipeline.py:116: in _fit
2022-07-28T15:17:14.1926265Z     train_predicted = self.root_node.fit(input_data=input_data)
2022-07-28T15:17:14.1926664Z fedot/core/pipelines/node.py:335: in fit
2022-07-28T15:17:14.1927276Z     secondary_input = self._input_from_parents(input_data=input_data, parent_operation='fit')
2022-07-28T15:17:14.1927789Z fedot/core/pipelines/node.py:363: in _input_from_parents
2022-07-28T15:17:14.1928185Z     parent_operation)
2022-07-28T15:17:14.1928575Z fedot/core/pipelines/node.py:402: in _combine_parents
2022-07-28T15:17:14.1929017Z     prediction = parent.fit(input_data=input_data)
2022-07-28T15:17:14.1929430Z fedot/core/pipelines/node.py:335: in fit
2022-07-28T15:17:14.1930029Z     secondary_input = self._input_from_parents(input_data=input_data, parent_operation='fit')
2022-07-28T15:17:14.1930525Z fedot/core/pipelines/node.py:363: in _input_from_parents
2022-07-28T15:17:14.1930915Z     parent_operation)
2022-07-28T15:17:14.1931297Z fedot/core/pipelines/node.py:402: in _combine_parents
2022-07-28T15:17:14.1931738Z     prediction = parent.fit(input_data=input_data)
2022-07-28T15:17:14.1932151Z fedot/core/pipelines/node.py:335: in fit
2022-07-28T15:17:14.1932749Z     secondary_input = self._input_from_parents(input_data=input_data, parent_operation='fit')
2022-07-28T15:17:14.1933247Z fedot/core/pipelines/node.py:365: in _input_from_parents
2022-07-28T15:17:14.1933699Z     secondary_input = DataMerger.get(parent_results).merge()
2022-07-28T15:17:14.1934152Z fedot/core/data/merge/data_merger.py:56: in get
2022-07-28T15:17:14.1934528Z     return cls(outputs, data_type)
2022-07-28T15:17:14.1934935Z fedot/core/data/merge/data_merger.py:30: in __init__
2022-07-28T15:17:14.1935369Z     self.common_indices = find_common_elements(*idx_list)
2022-07-28T15:17:14.1935822Z fedot/core/data/array_utilities.py:10: in find_common_elements
2022-07-28T15:17:14.1936405Z     common_elements = reduce(np.intersect1d, indices[1:], indices[0])
2022-07-28T15:17:14.1936873Z <__array_function__ internals>:6: in intersect1d
2022-07-28T15:17:14.1937204Z     ???
2022-07-28T15:17:14.1937537Z _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ 
2022-07-28T15:17:14.1937801Z 
2022-07-28T15:17:14.1938176Z ar1 = array(['Alpen gold', 'Alpen gold1', 'Rossia shedraya dusha',
2022-07-28T15:17:14.1938848Z        'Rossia shedraya dusha1', 'Shipuchka', 'Shipuchka1...acks1', 'Werthers Original Caramel',
2022-07-28T15:17:14.1939461Z        'Werthers Original Caramel1', 'Whoppers', 'Whoppers1'],
2022-07-28T15:17:14.1939851Z       dtype=object)
2022-07-28T15:17:14.1940285Z ar2 = array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
2022-07-28T15:17:14.1940616Z        17, 18, 19])
2022-07-28T15:17:14.1940982Z assume_unique = False, return_indices = False
2022-07-28T15:17:14.1941265Z 
2022-07-28T15:17:14.1941512Z     @array_function_dispatch(_intersect1d_dispatcher)
2022-07-28T15:17:14.1941956Z     def intersect1d(ar1, ar2, assume_unique=False, return_indices=False):
2022-07-28T15:17:14.1942422Z         """
2022-07-28T15:17:14.1942780Z         Find the intersection of two arrays.
2022-07-28T15:17:14.1943124Z     
2022-07-28T15:17:14.1943516Z         Return the sorted, unique values that are in both of the input arrays.
2022-07-28T15:17:14.1943892Z     
2022-07-28T15:17:14.1944191Z         Parameters
2022-07-28T15:17:14.1944546Z         ----------
2022-07-28T15:17:14.1944881Z         ar1, ar2 : array_like
2022-07-28T15:17:14.1945280Z             Input arrays. Will be flattened if not already 1D.
2022-07-28T15:17:14.1945678Z         assume_unique : bool
2022-07-28T15:17:14.1946097Z             If True, the input arrays are both assumed to be unique, which
2022-07-28T15:17:14.1946562Z             can speed up the calculation.  Default is False.
2022-07-28T15:17:14.1946936Z         return_indices : bool
2022-07-28T15:17:14.1947352Z             If True, the indices which correspond to the intersection of the two
2022-07-28T15:17:14.1947857Z             arrays are returned. The first instance of a value is used if there are
2022-07-28T15:17:14.1948299Z             multiple. Default is False.
2022-07-28T15:17:14.1948636Z     
2022-07-28T15:17:14.1949000Z             .. versionadded:: 1.15.0
2022-07-28T15:17:14.1949310Z     
2022-07-28T15:17:14.1949600Z         Returns
2022-07-28T15:17:14.1949957Z         -------
2022-07-28T15:17:14.1950290Z         intersect1d : ndarray
2022-07-28T15:17:14.1950844Z             Sorted 1D array of common and unique elements.
2022-07-28T15:17:14.1951220Z         comm1 : ndarray
2022-07-28T15:17:14.1951622Z             The indices of the first occurrences of the common values in `ar1`.
2022-07-28T15:17:14.1952092Z             Only provided if `return_indices` is True.
2022-07-28T15:17:14.1952465Z         comm2 : ndarray
2022-07-28T15:17:14.1952863Z             The indices of the first occurrences of the common values in `ar2`.
2022-07-28T15:17:14.1953327Z             Only provided if `return_indices` is True.
2022-07-28T15:17:14.1953749Z     
2022-07-28T15:17:14.1954015Z     
2022-07-28T15:17:14.1954298Z         See Also
2022-07-28T15:17:14.1954665Z         --------
2022-07-28T15:17:14.1955087Z         numpy.lib.arraysetops : Module with a number of other functions for
2022-07-28T15:17:14.1955566Z                                 performing set operations on arrays.
2022-07-28T15:17:14.1955907Z     
2022-07-28T15:17:14.1956201Z         Examples
2022-07-28T15:17:14.1956552Z         --------
2022-07-28T15:17:14.1956909Z         >>> np.intersect1d([1, 3, 4, 3], [3, 1, 2, 1])
2022-07-28T15:17:14.1957404Z         array([1, 3])
2022-07-28T15:17:14.1958062Z     
2022-07-28T15:17:14.1958418Z         To intersect more than two arrays, use functools.reduce:
2022-07-28T15:17:14.1958687Z     
2022-07-28T15:17:14.1958903Z         >>> from functools import reduce
2022-07-28T15:17:14.1959204Z         >>> reduce(np.intersect1d, ([1, 3, 4, 3], [3, 1, 2, 1], [6, 3, 4, 2]))
2022-07-28T15:17:14.1959650Z         array([3])
2022-07-28T15:17:14.1959849Z     
2022-07-28T15:17:14.1960105Z         To return the indices of the values common to the input arrays
2022-07-28T15:17:14.1960415Z         along with the intersected values:
2022-07-28T15:17:14.1960641Z     
2022-07-28T15:17:14.1960836Z         >>> x = np.array([1, 1, 2, 3, 4])
2022-07-28T15:17:14.1961075Z         >>> y = np.array([2, 1, 4, 6])
2022-07-28T15:17:14.1961369Z         >>> xy, x_ind, y_ind = np.intersect1d(x, y, return_indices=True)
2022-07-28T15:17:14.1961628Z         >>> x_ind, y_ind
2022-07-28T15:17:14.1961860Z         (array([0, 2, 4]), array([1, 0, 2]))
2022-07-28T15:17:14.1962093Z         >>> xy, x[x_ind], y[y_ind]
2022-07-28T15:17:14.1962332Z         (array([1, 2, 4]), array([1, 2, 4]), array([1, 2, 4]))
2022-07-28T15:17:14.1962549Z     
2022-07-28T15:17:14.1962724Z         """
2022-07-28T15:17:14.1962921Z         ar1 = np.asanyarray(ar1)
2022-07-28T15:17:14.1963156Z         ar2 = np.asanyarray(ar2)
2022-07-28T15:17:14.1963358Z     
2022-07-28T15:17:14.1963553Z         if not assume_unique:
2022-07-28T15:17:14.1963854Z             if return_indices:
2022-07-28T15:17:14.1964115Z                 ar1, ind1 = unique(ar1, return_index=True)
2022-07-28T15:17:14.1964403Z                 ar2, ind2 = unique(ar2, return_index=True)
2022-07-28T15:17:14.1964618Z             else:
2022-07-28T15:17:14.1964825Z                 ar1 = unique(ar1)
2022-07-28T15:17:14.1965042Z                 ar2 = unique(ar2)
2022-07-28T15:17:14.1965233Z         else:
2022-07-28T15:17:14.1965433Z             ar1 = ar1.ravel()
2022-07-28T15:17:14.1965646Z             ar2 = ar2.ravel()
2022-07-28T15:17:14.1965829Z     
2022-07-28T15:17:14.1966048Z         aux = np.concatenate((ar1, ar2))
2022-07-28T15:17:14.1966290Z         if return_indices:
2022-07-28T15:17:14.1966764Z             aux_sort_indices = np.argsort(aux, kind='mergesort')
2022-07-28T15:17:14.1967042Z             aux = aux[aux_sort_indices]
2022-07-28T15:17:14.1967256Z         else:
2022-07-28T15:17:14.1967442Z >           aux.sort()
2022-07-28T15:17:14.1967829Z E           TypeError: '<' not supported between instances of 'int' and 'str'
2022-07-28T15:17:14.1968036Z 

gkirgizov avatar Jul 29 '22 11:07 gkirgizov

@gkirgizov еще актуально? Выглядит как то, что так и должно быть. Непонятно, как мержить str и int в общем случае.

valer1435 avatar Sep 28 '23 09:09 valer1435