dateparser icon indicating copy to clipboard operation
dateparser copied to clipboard

Issues with "pl" language date searching

Open czyzby opened this issue 4 years ago • 0 comments

  1. Lack of support for dot-separated date formats. See #1023.
  2. "się" and "nie" are both flagged as dates, likely because "sie" (note the missing ę) is a shortened form of "sierpień" (August), while "nie" stands for "niedziela" (Sunday). This causes issues with search_dates, as it can affect other detected dates nearby. For example, "dziś o 11:00" (today at 11 AM) is correctly picked up as a date, but when you precede it with "nie" or "się" (both of which occur naturally in Polish), search_dates returns "nie" and "11:00" instead.
  3. Strings like "30/50" get picked up as dates. This is a common number format for addresses (street number/apartment number).
  4. MDY order seems to be preferred over DMY, which is the correct order for Polish dates.
  5. The following code causes an exception:
search_dates("50 1000", languages=["pl"])
TypeError                                 Traceback (most recent call last)
<ipython-input-61-e8893f27fd70> in <module>
----> 1 search_dates(line, languages=["pl"])

site-packages/dateparser/search/__init__.py in search_dates(text, languages, settings, add_detected_language)
     47 
     48         """
---> 49     result = _search_with_detection.search_dates(
     50         text=text, languages=languages, settings=settings
     51     )

site-packages/dateparser/conf.py in wrapper(*args, **kwargs)
     81                 "settings can only be either dict or instance of Settings class")
     82 
---> 83         return f(*args, **kwargs)
     84     return wrapper

site-packages/dateparser/search/search.py in search_dates(self, text, languages, settings)
    227         if not language_shortname:
    228             return {'Language': None, 'Dates': None}
--> 229         return {'Language': language_shortname, 'Dates': self.search.search_parse(language_shortname, text,
    230                                                                                   settings=settings)}

site-packages/dateparser/search/search.py in search_parse(self, shortname, text, settings)
    159         if shortname not in bad_translate_with_search:
    160             parser = DateDataParser(languages=['en'], settings=settings)
--> 161             parsed, substrings = self.parse_found_objects(parser=parser, to_parse=translated,
    162                                                           original=original, translated=translated, settings=settings)
    163         else:

site-packages/dateparser/search/search.py in parse_found_objects(self, parser, to_parse, original, translated, settings)
    139                                 for j, jtem in enumerate(split_translated):
    140                                     if len(jtem) > 2:
--> 141                                         parsed_jtem = self.parse_item(parser, jtem, split_translated[j],
    142                                                                       current_parsed, need_relative_base)
    143                                         current_parsed.append(parsed_jtem)

site-packages/dateparser/search/search.py in parse_item(self, parser, item, translated_item, parsed, need_relative_base)
    100         item = item.replace('ngày', '')
    101         item = item.replace('am', '')
--> 102         pre_parsed_item = parser.get_date_data(item)
    103         is_relative = date_is_relative(translated_item)
    104         if need_relative_base:

site-packages/dateparser/date.py in get_date_data(self, date_string, date_formats)
    414 
    415         for locale in self._get_applicable_locales(date_string):
--> 416             parsed_date = _DateLocaleParser.parse(
    417                 locale, date_string, date_formats, settings=self._settings)
    418             if parsed_date:

site-packages/dateparser/date.py in parse(cls, locale, date_string, date_formats, settings)
    187     def parse(cls, locale, date_string, date_formats=None, settings=None):
    188         instance = cls(locale, date_string, date_formats, settings)
--> 189         return instance._parse()
    190 
    191     def _parse(self):

site-packages/dateparser/date.py in _parse(self)
    197             self._try_hardcoded_formats,
    198         ):
--> 199             date_obj = parser()
    200             if self._is_valid_date_obj(date_obj):
    201                 return date_obj

site-packages/dateparser/date.py in _try_parser(self)
    219                 if _order == _default_date_order:
    220                     self._settings.DATE_ORDER = self.locale.info.get('date_order', _order)
--> 221             date_obj, period = date_parser.parse(
    222                 self._get_translated_date(), settings=self._settings)
    223             self._settings.DATE_ORDER = _order

site-packages/dateparser/conf.py in wrapper(*args, **kwargs)
     81                 "settings can only be either dict or instance of Settings class")
     82 
---> 83         return f(*args, **kwargs)
     84     return wrapper

site-packages/dateparser/date_parser.py in parse(self, date_string, settings)
     24         date_string, ptz = pop_tz_offset_from_string(date_string)
     25 
---> 26         date_obj, period = parse(date_string, settings=settings)
     27 
     28         _settings_tz = settings.TIMEZONE.lower()

site-packages/dateparser/parser.py in parse(datestring, settings)
     68             exceptions.append(e)
     69     else:
---> 70         raise exceptions.pop(-1)
     71 
     72 

site-packages/dateparser/parser.py in parse(datestring, settings)
     62     for parser in [_parser.parse, _no_spaces_parser.parse]:
     63         try:
---> 64             res = parser(datestring, settings)
     65             if res:
     66                 return res

site-packages/dateparser/parser.py in parse(cls, datestring, settings)
    445     def parse(cls, datestring, settings):
    446         tokens = tokenizer(datestring)
--> 447         po = cls(tokens.tokenize(), settings)
    448         dateobj = po._results()
    449 

site-packages/dateparser/parser.py in __init__(self, tokens, settings)
    272                 if type == 0:
    273                     params.update({attr: int(token)})
--> 274                     datetime(**params)
    275                     setattr(self, '_token_%s' % attr, token)
    276                     setattr(self, attr, int(token))

TypeError: function missing required argument 'day' (pos 3)

czyzby avatar Dec 01 '21 11:12 czyzby