search-tweets-python
Problem with geo parsing in the v2 API
Describe the bug: While downloading tweets with the academic API, collect_results stops with an error.
To Reproduce: Steps (and code snippet) to reproduce the behavior: The code is part of a bigger system, so I can't provide a simple standalone example. I also can't reproduce it with a fresh snippet because I've hit the monthly tweet cap for the full-archive search 😕
import json
from typing import List, Any, Dict

from arrow import Arrow
from searchtweets import collect_results, load_credentials, gen_request_parameters

from app.application.scrap_service import ScrapService
from app.domain.raw_json_twitter_response import RawJsonTwitterResponse
from app.util.log_util import get_logger

logger = get_logger('twitter_scrap')


class OfficialTwitterScrapService(ScrapService):
    _config_file: str
    _premium_search_args: Dict[str, Any]

    def __init__(self, config_file: str):
        self._config_file = config_file
        self._premium_search_args = load_credentials(self._config_file,
                                                     yaml_key="search_tweets_premium",
                                                     env_overwrite=False)

    def scrap(
            self,
            query: str,
            since: Arrow,
            until: Arrow
    ) -> List[RawJsonTwitterResponse]:
        logger.info(
            f'run scrap query :: {query}'
            f' | since :: {since.isoformat()}'
            f' | until :: {until.isoformat()}'
        )
        query = gen_request_parameters(
            query=query,
            granularity=None,
            results_per_call=100,
            start_time=self._get_string_time_from_arrow(since),
            end_time=self._get_string_time_from_arrow(until),
            expansions='attachments.poll_ids,attachments.media_keys,author_id,'
                       'entities.mentions.username,geo.place_id,in_reply_to_user_id,'
                       'referenced_tweets.id,referenced_tweets.id.author_id',
            media_fields='duration_ms,height,media_key,preview_image_url,type,url,width,'
                         'public_metrics,alt_text',
            place_fields='contained_within,country,country_code,full_name,geo,id,name,place_type',
            # .replace(' ', '') strips the spaces after the commas below,
            # yielding the plain comma-separated list the API expects:
            tweet_fields='attachments, author_id, context_annotations, conversation_id, created_at,'
                         ' entities, geo, id, in_reply_to_user_id, lang, public_metrics,'
                         ' possibly_sensitive, referenced_tweets, reply_settings, source,'
                         ' text, withheld'.replace(' ', ''),
            user_fields='created_at,description,entities,id,location,name,pinned_tweet_id,'
                        'profile_image_url,protected,public_metrics,url,username,verified,withheld'
        )
        tweets = collect_results(
            query,
            max_tweets=10_000_000,
            result_stream_args=self._premium_search_args
        )
        return [RawJsonTwitterResponse(json.dumps(it)) for it in tweets]

    @staticmethod
    def _get_string_time_from_arrow(time: Arrow) -> str:
        # Trim the seconds and UTC offset from the ISO string, e.g.
        # '2020-04-13T00:00:00+00:00' -> '2020-04-13T00:00'
        return time.isoformat()[:-9]
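For context, the service is invoked roughly like this (a minimal sketch; the credentials file name and query below are placeholders, not the real ones):

import arrow

# Hypothetical invocation; file name and query are placeholders.
service = OfficialTwitterScrapService('twitter_keys.yaml')
responses = service.scrap(
    query='(#covid OR #koronawirus) lang:pl',
    since=arrow.get('2020-04-13'),
    until=arrow.get('2020-04-14'),
)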
Expected behavior: I want to scrape tweets without errors.
Environment
- Ubuntu 20.20
- Docker Python:3.8
Additional context: Error log:
2021-12-05 09:24:42,927 [searchtweets.result_stream ] INFO paging; total requests read so far: 103
2021-12-05 09:24:44,929 [searchtweets.result_stream ] DEBUG sending request
2021-12-05 09:24:45,971 [urllib3.connectionpool ] DEBUG https://api.twitter.com:443 "GET /2/tweets/search/all?query=%28%22%23covid%22+OR+%22%23COVID-19%22+OR+%22%23Covid19%22+OR+%22%23doros%C5%82o%C5%9B%C4%87%22+OR+%22%23generacjaX%22+OR+%22%23generacjaY%22+OR+%22%23generacjaZ%22+OR+%22%23genX%22+OR+%22%23genY%22+OR+%22%23genZ%22+OR+%22%23koronawirus%22+OR+%22%23koronawiruspolska%22+OR+%22%23liceum%22+OR+%22%23lockdown%22+OR+%22%23matura%22+OR+%22%23matura2020%22+OR+%22%23matura2021%22+OR+%22%23matura2022%22+OR+%22%23millenialsi%22+OR+%22%23m%C5%82odzi%22+OR+%22%23pandemia%22+OR+%22%23pierwszami%C5%82o%C5%9B%C4%87%22+OR+%22%23pierwszapraca%22+OR+%22%23praca2020%22+OR+%22%23praca2021%22+OR+%22%23praca2022%22+OR+%22%23pracazdalna%22+OR+%22%23praktyki%22+OR+%22%23rekrutacja2020%22+OR+%22%23rekrutacja2021%22+OR+%22%23rekrutacja2022%22+OR+%22%23siedznadupie%22+OR+%22%23solidarno%C5%9B%C4%87%22+OR+%22%23sta%C5%BC%22+OR+%22%23strajkkobiet%22+OR+%22%23studia2020%22+OR+%22%23studia2021%22+OR+%22%23studia2022%22+OR+%22%23studiazdalne%22+OR+%22%23zdalne%22+OR+%22%23zdalnenauczanie%22+OR+%22%23zostanwdomu%22%29+lang%3Apl&start_time=2020-04-13T00%3A00%3A00Z&end_time=2020-04-14T00%3A00%3A00Z&max_results=100&tweet.fields=attachments%2Cauthor_id%2Ccontext_annotations%2Cconversation_id%2Ccreated_at%2Centities%2Cgeo%2Cid%2Cin_reply_to_user_id%2Clang%2Cpublic_metrics%2Cpossibly_sensitive%2Creferenced_tweets%2Creply_settings%2Csource%2Ctext%2Cwithheld&user.fields=created_at%2Cdescription%2Centities%2Cid%2Clocation%2Cname%2Cpinned_tweet_id%2Cprofile_image_url%2Cprotected%2Cpublic_metrics%2Curl%2Cusername%2Cverified%2Cwithheld&media.fields=duration_ms%2Cheight%2Cmedia_key%2Cpreview_image_url%2Ctype%2Curl%2Cwidth%2Cpublic_metrics%2Calt_text&place.fields=contained_within%2Ccountry%2Ccountry_code%2Cfull_name%2Cgeo%2Cid%2Cname%2Cplace_type&expansions=attachments.poll_ids%2Cattachments.media_keys%2Cauthor_id%2Centities.mentions.username%2Cgeo.place_id%2Cin_reply_to_user_id%2Creferenced_tweets.id%2Creferenced_tweets.id.author_id&next_token=b26v89c19zqg8o3fo77h5m9ag2pb6dnxq7h6w432p5myl HTTP/1.1" 200 60232
Traceback (most recent call last):
File "app/main.py", line 41, in <module>
worker_loop()
File "app/main.py", line 34, in worker_loop
single_work()
File "app/main.py", line 28, in single_work
get_worker_service().run()
File "/app/app/application/worker_service.py", line 53, in run
raw_responses = self._scrap_service.scrap(
File "/app/app/infrastructure/official_twitter_scrap_service.py", line 54, in scrap
tweets = collect_results(
File "/root/.cache/pypoetry/virtualenvs/swps-tweet-infrastructure-9TtSrW0h-py3.8/lib/python3.8/site-packages/searchtweets/result_stream.py", line 467, in collect_results
return list(rs.stream())
File "/root/.cache/pypoetry/virtualenvs/swps-tweet-infrastructure-9TtSrW0h-py3.8/lib/python3.8/site-packages/searchtweets/result_stream.py", line 361, in stream
yield from self.formatted_output()
File "/root/.cache/pypoetry/virtualenvs/swps-tweet-infrastructure-9TtSrW0h-py3.8/lib/python3.8/site-packages/searchtweets/result_stream.py", line 288, in formatted_output
includes_tweets[included_id] = expand_payload(included_tweet)
File "/root/.cache/pypoetry/virtualenvs/swps-tweet-infrastructure-9TtSrW0h-py3.8/lib/python3.8/site-packages/searchtweets/result_stream.py", line 270, in expand_payload
place_id = payload["geo"]['place_id']
KeyError: 'place_id'
Thanks! I think this case is missing in the code: when a tweet has exact geo coordinates but no tagged place. There's no quick workaround, unfortunately, not without changing some code here: https://github.com/twitterdev/search-tweets-python/blob/v2/searchtweets/result_stream.py#L269-L271
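For illustration, a tweet that carries exact coordinates but no tagged place has a geo object with no place_id key, so the unguarded lookup in expand_payload raises (a minimal sketch of the payload shape):

# Tweet payload with exact coordinates but no tagged place:
payload = {
    "id": "1",
    "text": "...",
    "geo": {"coordinates": {"type": "Point", "coordinates": [21.01, 52.23]}},
}
payload["geo"]["place_id"]  # KeyError: 'place_id' -- the line that fails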
This should fix it:
if "geo" in payload and "place_id" in payload["geo"]:
place_id = payload["geo"]['place_id']
payload["geo"] = merge_dicts(payload["geo"], includes_places[place_id])
I'd rather wait for an official release (I'm building Docker containers, so preparing a container with a patched lib will be a bit harder). When will it be released?
No idea, but if you don't want to wait you can install the patch directly with pip from the branch, like this:
pip install git+https://github.com/twitterdev/search-tweets-python.git@refs/pull/144/merge
Or
pip install https://github.com/igorbrigadir/search-tweets-python/archive/patch-2.zip
Or put
https://github.com/igorbrigadir/search-tweets-python/archive/patch-2.zip
directly into your requirements.txt in place of searchtweets-v2, and your Docker build should work all the same!