FBMessageScraper
Doesn't work anymore?
I remember this used to work, but now when I try it on a normal-sized conversation (a few thousand messages) it endlessly scrapes the first 2000 messages.
Yeah, I'm having the exact same problem. I've spent hours trying to debug it but can't really think of a reason. I wonder why this is happening.
I found the solution while checking out Pull #4: FB has added a timestamp field now.
Hi @dufferzafar, do you know how I can make this work with dumper.py? The pull you referenced only affects group_dumper.py. I'm trying to scrape a single conversation, but I keep getting the last 2000 messages.
@szmarci the same trick is required to get it working with dumper.py
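Roughly, the trick is: send an extra [timestamp] field in the POST data, and after every response set it to one less than the timestamp of the oldest message you just received, so the next request pages further back instead of returning the same newest 2000 again. A minimal sketch (placeholder values; field names as in the full script further down):

# Assumed sketch of the timestamp trick as it applies to dumper.py's request data.
# The field names mirror the full script below; the values here are placeholders.
talk = "1075686392"   # conversation ID (placeholder)
offset = 0
limit = 2000
timestamp = 0         # 0 means "start from the newest messages"

data_text = {"messages[user_ids][" + talk + "][offset]": str(offset),
             "messages[user_ids][" + talk + "][limit]": str(limit),
             # The new field FB expects: only return messages older than this timestamp
             "messages[user_ids][" + talk + "][timestamp]": str(timestamp),
             # ... plus the usual client/__user/fb_dtsg/... fields ...
             }

# After parsing each JSON response, move the cursor just past the oldest
# message returned, so the next request pages further back in history:
# actions = json_data['payload']['actions']
# timestamp = int(actions[0]['timestamp']) - 1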
I have a similar script here, in case you need it. https://github.com/dufferzafar/Python-Scripts/blob/master/Facebook/Conversations/messages.py
@szmarci I've modified the original script to work with Facebook's new timestamp field.
Here it is:
import urllib2
import urllib
import gzip
import os
import json
import sys
import time
import StringIO
__author__ = "Raghav Sood"
__copyright__ = "Copyright 2014"
__credits__ = ["Raghav Sood"]
__license__ = "CC"
__version__ = "1.0"
__maintainer__ = "Raghav Sood"
__email__ = "[email protected]"
__status__ = "Production"
if len(sys.argv) <= 1:
    print "Usage:\n python dumper.py [conversation ID] [chunk_size (recommended: 2000)] [{optional} offset location (default: 0)]"
    print "Example conversation with Raghav Sood"
    print " python dumper.py 1075686392 2000 0"
    sys.exit()
error_timeout = 30 # Change this to alter error timeout (seconds)
general_timeout = 7 # Change this to alter waiting time after every request (seconds)
messages = []
talk = sys.argv[1]
offset = int(sys.argv[3]) if len(sys.argv) >= 4 else 0
timestamp = 0  # 0 = start from the newest messages; updated after each request
messages_data = "lolno"  # placeholder so the while loop runs at least once
end_mark = "\"payload\":{\"end_of_history\""
limit = int(sys.argv[2])
headers = {"origin": "https://www.facebook.com",
"accept-encoding": "gzip,deflate",
"accept-language": "en-US,en;q=0.8",
"cookie": "your_cookie_value" # fill cookie value
"pragma": "no-cache",
"user-agent": " Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/44.0.2403.107 Safari/537.36",
"content-type": "application/x-www-form-urlencoded",
"accept": "*/*",
"cache-control": "no-cache",
"referer": "https://www.facebook.com/messages/zuck"}
base_directory = "Messages/"
directory = base_directory + str(talk) + "/"
pretty_directory = base_directory + str(talk) + "/Pretty/"
try:
    os.makedirs(directory)
except OSError:
    pass  # already exists
try:
    os.makedirs(pretty_directory)
except OSError:
    pass  # already exists
while end_mark not in messages_data:
    data_text = {"messages[user_ids][" + str(talk) + "][offset]": str(offset),
                 "messages[user_ids][" + str(talk) + "][limit]": str(limit),
                 "messages[user_ids][" + str(talk) + "][timestamp]": str(timestamp),
                 "client": "web_messenger",
                 "__user": "your_user_id",  # fill in the POST form values
                 "__a": "your __a",
                 "__dyn": "your __dyn",
                 "__req": "your __req",
                 "fb_dtsg": "your_fb_dtsg",
                 "ttstamp": "your_ttstamp",
                 "__rev": "your __rev"}

    data = urllib.urlencode(data_text)
    url = "https://www.facebook.com/ajax/mercury/thread_info.php"

    print "Retrieving messages " + str(offset) + "-" + str(limit+offset) + " for conversation ID " + str(talk)

    req = urllib2.Request(url, data, headers)
    response = urllib2.urlopen(req)
    compressed = StringIO.StringIO(response.read())
    decompressedFile = gzip.GzipFile(fileobj=compressed)

    outfile = open(directory + str(offset) + "-" + str(limit+offset) + ".json", 'w')
    messages_data = decompressedFile.read()
    messages_data = messages_data[9:]  # strip the "for (;;);" prefix FB prepends to JSON responses
    json_data = json.loads(messages_data)

    if json_data is not None and json_data['payload'] is not None:
        try:
            messages = json_data['payload']['actions'] + messages
            # Page backwards: request messages older than the oldest one just received
            timestamp = int(json_data['payload']['actions'][0]['timestamp']) - 1
        except KeyError:
            pass  # no more messages
    else:
        print "Error in retrieval. Retrying after " + str(error_timeout) + "s"
        print "Data Dump:"
        print json_data
        time.sleep(error_timeout)
        continue

    outfile.write(messages_data)
    outfile.close()

    command = "python -mjson.tool " + directory + str(offset) + "-" + str(limit+offset) + ".json > " + pretty_directory + str(offset) + "-" + str(limit+offset) + ".pretty.json"
    os.system(command)

    offset = offset + limit
    time.sleep(general_timeout)
finalfile = open(directory + "complete.json", 'wb')
finalfile.write(json.dumps(messages))
finalfile.close()
command = "python -mjson.tool " + directory + "complete.json > " + pretty_directory + "complete.pretty.json"
os.system(command)
This is great — seems to be working for me!
I've included these changes in my fork, which also has a stdin header parser to make your life easier if you run this frequently like I do!
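For anyone curious, a stdin header parser along those lines could be as simple as the sketch below (my own rough version, not necessarily what the fork does): feed it "Name: value" lines copied from the browser's network tab and it builds a dict you can use as the headers variable in the script above.

# Assumed sketch: read "Name: value" header lines from stdin until a blank
# line or EOF, and return them as a dict suitable for urllib2.Request.
import sys

def parse_headers(stream=sys.stdin):
    headers = {}
    for line in stream:
        line = line.strip()
        if not line:
            break  # blank line ends the header block
        if ":" not in line:
            continue  # skip anything that isn't a "Name: value" pair
        name, value = line.split(":", 1)
        headers[name.strip().lower()] = value.strip()
    return headers

# Example (assumed workflow, not part of the original script):
#   $ python dumper.py 1075686392 2000 < headers.txt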