onenote_export icon indicating copy to clipboard operation
onenote_export copied to clipboard

Process crashed with `AttributeError: 'MyHTMLParser' object has no attribute 'attrs'`

Open brlin-tw opened this issue 3 years ago • 1 comment

The program seems to crash while downloading attached images (or similar embedded content).

This is an Ubuntu 20.10 desktop, running Python 3.8.6 in a Python venv.

Traceback (most recent call last)

  • File "/venv/lib/python3.8/site-packages/flask/app.py", line 2464, in __call__

      def __call__(self, environ, start_response):
    
      """The WSGI server calls the Flask application object as the
    
      WSGI application. This calls :meth:`wsgi_app` which can be
    
      wrapped to applying middleware."""
    
      return self.wsgi_app(environ, start_response)
    
    
    
      def __repr__(self):
    
      return "<%s %r>" % (self.__class__.__name__, self.name)
    
  • File "/venv/lib/python3.8/site-packages/flask/app.py", line 2450, in wsgi_app

      try:
    
      ctx.push()
    
      response = self.full_dispatch_request()
    
      except Exception as e:
    
      error = e
    
      response = self.handle_exception(e)
    
      except:  # noqa: B001
    
      error = sys.exc_info()[1]
    
      raise
    
      return response(environ, start_response)
    
      finally:
    
  • File "/venv/lib/python3.8/site-packages/flask/app.py", line 1867, in handle_exception

      # if we want to repropagate the exception, we can attempt to
    
      # raise it with the whole traceback in case we can do that
    
      # (the function was actually called from the except part)
    
      # otherwise, we just raise the error again
    
      if exc_value is e:
    
      reraise(exc_type, exc_value, tb)
    
      else:
    
      raise e
    
    
    
      self.log_exception((exc_type, exc_value, tb))
    
      server_error = InternalServerError()
    
  • File "/venv/lib/python3.8/site-packages/flask/_compat.py", line 39, in reraise

      import collections.abc as collections_abc
    
    
    
      def reraise(tp, value, tb=None):
    
      if value.__traceback__ is not tb:
    
      raise value.with_traceback(tb)
    
      raise value
    
    
    
      implements_to_string = _identity
    
    
    
      else:
    
      iterkeys = lambda d: d.iterkeys()
    
  • File "/venv/lib/python3.8/site-packages/flask/app.py", line 2447, in wsgi_app

      ctx = self.request_context(environ)
    
      error = None
    
      try:
    
      try:
    
      ctx.push()
    
      response = self.full_dispatch_request()
    
      except Exception as e:
    
      error = e
    
      response = self.handle_exception(e)
    
      except:  # noqa: B001
    
      error = sys.exc_info()[1]
    
  • File "/venv/lib/python3.8/site-packages/flask/app.py", line 1952, in full_dispatch_request

      request_started.send(self)
    
      rv = self.preprocess_request()
    
      if rv is None:
    
      rv = self.dispatch_request()
    
      except Exception as e:
    
      rv = self.handle_user_exception(e)
    
      return self.finalize_request(rv)
    
    
    
      def finalize_request(self, rv, from_error_handler=False):
    
      """Given the return value from a view function this finalizes
    
      the request by converting it into a response and invoking the
    
  • File "/venv/lib/python3.8/site-packages/flask/app.py", line 1821, in handle_user_exception

      return self.handle_http_exception(e)
    
    
    
      handler = self._find_error_handler(e)
    
    
    
      if handler is None:
    
      reraise(exc_type, exc_value, tb)
    
      return handler(e)
    
    
    
      def handle_exception(self, e):
    
      """Handle an exception that did not have an error handler
    
      associated with it, or that was raised from an error handler.
    
  • File "/venv/lib/python3.8/site-packages/flask/_compat.py", line 39, in reraise

      import collections.abc as collections_abc
    
    
    
      def reraise(tp, value, tb=None):
    
      if value.__traceback__ is not tb:
    
      raise value.with_traceback(tb)
    
      raise value
    
    
    
      implements_to_string = _identity
    
    
    
      else:
    
      iterkeys = lambda d: d.iterkeys()
    
  • File "/venv/lib/python3.8/site-packages/flask/app.py", line 1950, in full_dispatch_request

      self.try_trigger_before_first_request_functions()
    
      try:
    
      request_started.send(self)
    
      rv = self.preprocess_request()
    
      if rv is None:
    
      rv = self.dispatch_request()
    
      except Exception as e:
    
      rv = self.handle_user_exception(e)
    
      return self.finalize_request(rv)
    
    
    
      def finalize_request(self, rv, from_error_handler=False):
    
  • File "/venv/lib/python3.8/site-packages/flask/app.py", line 1936, in dispatch_request

      getattr(rule, "provide_automatic_options", False)
    
      and req.method == "OPTIONS"
    
      ):
    
      return self.make_default_options_response()
    
      # otherwise dispatch to the handler for that endpoint
    
      return self.view_functions[rule.endpoint](**req.view_args)
    
    
    
      def full_dispatch_request(self):
    
      """Dispatches the request and on top of that performs request
    
      pre and postprocessing as well as HTTP exception catching and
    
      error handling.
    
  • File "/onenote_export.py", line 236, in main_logic

      def main_logic():
    
      code = flask.request.args['code']
    
      token = application.acquire_token_by_authorization_code(code, scopes=scopes,
    
      redirect_uri=redirect_uri)
    
      graph_client = OAuth2Session(token=token)
    
      download_notebooks(graph_client, app.config['output_path'], app.config['select_path'], indent=0)
    
      print("Done!")
    
      return flask.render_template_string('<html>'
    
      '<head><title>Done</title></head>'
    
      '<body><p1><b>Done</b></p1></body>'
    
      '</html>')
    
  • File "/onenote_export.py", line 175, in download_notebooks

      nb_name = nb["displayName"]
    
      indent_print(indent, f'Opening notebook {nb_name}')
    
      sections = get_json(graph_client, nb['sectionsUrl'])
    
      section_groups = get_json(graph_client, nb['sectionGroupsUrl'])
    
      indent_print(indent + 1, f'Got {len(sections)} sections and {len(section_groups)} section groups.')
    
      download_sections(graph_client, sections, path / nb_name, select, indent=indent + 1)
    
      download_section_groups(graph_client, section_groups, path / nb_name, select, indent=indent + 1)
    
    
    
    
    
      def download_section_groups(graph_client, section_groups, path, select=None, indent=0):
    
      section_groups, select = filter_items(section_groups, select, 'section groups', indent)
    
  • File "/onenote_export.py", line 196, in download_sections

      for sec in sections:
    
      sec_name = sec["displayName"]
    
      indent_print(indent, f'Opening section {sec_name}')
    
      pages = get_json(graph_client, sec['pagesUrl'] + '?pagelevel=true')
    
      indent_print(indent + 1, f'Got {len(pages)} pages.')
    
      download_pages(graph_client, pages, path / sec_name, select, indent=indent + 1)
    
    
    
    
    
      def download_pages(graph_client, pages, path, select=None, indent=0):
    
      pages, select = filter_items(pages, select, 'pages', indent)
    
      pages = sorted([(page['order'], page) for page in pages])
    
  • File "/onenote_export.py", line 212, in download_pages

      if level == 0:
    
      page_dir = path / page_title
    
      else:
    
      page_dir = level_dirs[level - 1] / page_title
    
      level_dirs[level] = page_dir
    
      download_page(graph_client, page['contentUrl'], page_dir, indent=indent + 1)
    
    
    
    
    
      def download_page(graph_client, page_url, path, indent=0):
    
      out_html = path / 'main.html'
    
      if out_html.exists():
    
  • File "/onenote_export.py", line 225, in download_page

      path.mkdir(parents=True, exist_ok=True)
    
      response = get(graph_client, page_url, indent=indent)
    
      if response is not None:
    
      content = response.text
    
      indent_print(indent, f'Got content of length {len(content)}')
    
      content = download_attachments(graph_client, content, path, indent=indent)
    
      with open(out_html, "w", encoding='utf-8') as f:
    
      f.write(content)
    
    
    
    
    
      @app.route("/getToken")
    
  • File "/onenote_export.py", line 146, in download_attachments

      with open(attachment_dir / file_name, "wb") as f:
    
      f.write(data)
    
      props['data'] = "attachments/" + file_name
    
      return generate_html('object', props)
    
    
    
      content = re.sub(r"<img .*?\/>", download_image, content, flags=re.DOTALL)
    
      content = re.sub(r"<object .*?\/>", download_attachment, content, flags=re.DOTALL)
    
      return content
    
    
    
    
    
      def indent_print(depth, text):
    
  • File "/usr/lib/python3.8/re.py", line 210, in sub

      non-overlapping occurrences of the pattern in string by the
    
      replacement repl.  repl can be either a string or a callable;
    
      if a string, backslash escapes in it are processed.  If it is
    
      a callable, it's passed the Match object and must return
    
      a replacement string to be used."""
    
      return _compile(pattern, flags).sub(repl, string, count)
    
    
    
      def subn(pattern, repl, string, count=0, flags=0):
    
      """Return a 2-tuple containing (new_string, number).
    
      new_string is the string obtained by replacing the leftmost
    
      non-overlapping occurrences of the pattern in the source
    
  • File "/onenote_export.py", line 108, in download_image

      def download_image(tag_match):
    
      # <img width="843" height="218.5" src="..." data-src-type="image/png" data-fullres-src="..."
    
      # data-fullres-src-type="image/png" />
    
      parser = MyHTMLParser()
    
      parser.feed(tag_match[0])
    
      props = parser.attrs
    
      image_url = props.get('data-fullres-src', props['src'])
    
      image_type = props.get('data-fullres-src-type', props['data-src-type']).split("/")[-1]
    
      file_name = ''.join(random.choice(string.ascii_lowercase) for _ in range(10)) + '.' + image_type
    
      req = get(graph_client, image_url, indent=indent)
    
      if req is None:
    

AttributeError: 'MyHTMLParser' object has no attribute 'attrs'

brlin-tw avatar Mar 16 '21 06:03 brlin-tw

It looks to me like this is a bug in the upstream HTML parser. I was able to work around the issue by opening OneNote,

  • finding the image with the problematic alt text and
  • editing it to remove the HTML code fragments, or more precisely, the angle brackets and quotes: < > ' ".
    • (Select image >> r-click >> Alt text...)

maphew avatar Jan 08 '22 00:01 maphew