Make REQUEST_HEADERS in LinkContentFetcher customizable
from haystack.components.fetchers.link_content import LinkContentFetcher
headers = ["Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"]
fetcher = LinkContentFetcher(user_agents=headers)
streams = fetcher.run(urls=["https://zhuanlan.zhihu.com/p/670768194"])["streams"]
This error occurred when executing the above code:
HTTPError Traceback (most recent call last) Cell In[52], line 6 3 headers = ["Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"] 4 fetcher = LinkContentFetcher(user_agents=headers) ----> 6 streams = fetcher.run(urls=["https://zhuanlan.zhihu.com/p/670768194"])["streams"] 8 streams
File ~/anaconda3/envs/py311/lib/python3.11/site-packages/haystack/components/fetchers/link_content.py:152, in LinkContentFetcher.run(self, urls) 150 # don't use multithreading if there's only one URL 151 if len(urls) == 1: --> 152 stream_metadata, stream = self._fetch(urls[0]) 153 stream.meta.update(stream_metadata) 154 stream.mime_type = stream.meta.get("content_type", None)
File ~/anaconda3/envs/py311/lib/python3.11/site-packages/haystack/components/fetchers/link_content.py:191, in LinkContentFetcher._fetch(self, url) 189 except Exception as e: 190 if self.raise_on_failure: --> 191 raise e 192 # less verbose log as this is expected to happen often (requests failing, blocked, etc.) 193 logger.debug("Couldn't retrieve content from {url} because {error}", url=url, error=str(e))
File ~/anaconda3/envs/py311/lib/python3.11/site-packages/haystack/components/fetchers/link_content.py:185, in LinkContentFetcher._fetch(self, url) 183 stream: ByteStream = ByteStream(data=b"") 184 try: --> 185 response = self._get_response(url) 186 content_type = self._get_content_type(response) 187 handler: Callable = self._resolve_handler(content_type)
File ~/anaconda3/envs/py311/lib/python3.11/site-packages/tenacity/__init__.py:336, in BaseRetrying.wraps.<locals>.wrapped_f(*args, **kw)
File ~/anaconda3/envs/py311/lib/python3.11/site-packages/tenacity/__init__.py:475, in Retrying.__call__(self, fn, *args, **kwargs) 473 retry_state = RetryCallState(retry_object=self, fn=fn, args=args, kwargs=kwargs) 474 while True: --> 475 do = self.iter(retry_state=retry_state) 476 if isinstance(do, DoAttempt): 477 try:
File ~/anaconda3/envs/py311/lib/python3.11/site-packages/tenacity/__init__.py:376, in BaseRetrying.iter(self, retry_state) 374 result = None 375 for action in self.iter_state.actions: --> 376 result = action(retry_state) 377 return result
File ~/anaconda3/envs/py311/lib/python3.11/site-packages/tenacity/__init__.py:418, in BaseRetrying._post_stop_check_actions.<locals>.exc_check(rs)
File ~/anaconda3/envs/py311/lib/python3.11/site-packages/tenacity/__init__.py:185, in RetryError.reraise(self) 183 def reraise(self) -> t.NoReturn: 184 if self.last_attempt.failed: --> 185 raise self.last_attempt.result() 186 raise self
File ~/anaconda3/envs/py311/lib/python3.11/concurrent/futures/_base.py:449, in Future.result(self, timeout) 447 raise CancelledError() 448 elif self._state == FINISHED: --> 449 return self.__get_result() 451 self._condition.wait(timeout) 453 if self._state in [CANCELLED, CANCELLED_AND_NOTIFIED]:
File ~/anaconda3/envs/py311/lib/python3.11/concurrent/futures/_base.py:401, in Future.__get_result(self) 399 if self._exception: 400 try: --> 401 raise self._exception 402 finally: 403 # Break a reference cycle with the exception in self._exception 404 self = None
File ~/anaconda3/envs/py311/lib/python3.11/site-packages/tenacity/__init__.py:478, in Retrying.__call__(self, fn, *args, **kwargs) 476 if isinstance(do, DoAttempt): 477 try: --> 478 result = fn(*args, **kwargs) 479 except BaseException: # noqa: B902 480 retry_state.set_exception(sys.exc_info()) # type: ignore[arg-type]
File ~/anaconda3/envs/py311/lib/python3.11/site-packages/haystack/components/fetchers/link_content.py:123, in LinkContentFetcher.__init__.<locals>.get_response(url)
File ~/anaconda3/envs/py311/lib/python3.11/site-packages/requests/models.py:1024, in Response.raise_for_status(self) 1019 http_error_msg = ( 1020 f"{self.status_code} Server Error: {reason} for url: {self.url}" 1021 ) 1023 if http_error_msg: -> 1024 raise HTTPError(http_error_msg, response=self)
HTTPError: 403 Client Error: Forbidden for url: https://zhuanlan.zhihu.com/p/670768194
I looked through the LinkContentFetcher defined by Haystack for additional available parameters. I want to know how to solve this problem. Maybe I need to add a request header, disable the proxy, or pass in other parameters. How can I add a request header to this component?
Hello @aappaappoo, and thank you for your feedback. We have a PR open that refactors the LinkContentFetcher, and that PR is about to be merged: https://github.com/deepset-ai/haystack/pull/9034
If you want, you can already give it a try by installing from the link_fetcher branch: `pip install git+https://github.com/deepset-ai/haystack.git@link_fetcher`
After this refactoring, we plan to add more customization of request headers as discussed here. However, I cannot give you a timeline for that yet.