-
Notifications
You must be signed in to change notification settings - Fork 10
/
facebook_connector.py
177 lines (159 loc) · 7.06 KB
/
facebook_connector.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
"""
Author: Nigel Schuster <nigel schusters at gmail dot com>
This module provides a simple abstraction for accessing Facebook message data
from an archive downloaded from facebook.com
"""
from __future__ import print_function
from bs4 import BeautifulSoup
from fbchat_archive_parser import parser
import logging
import os
import io
import pandas as pd
import re
import requests
import warnings
# Matches identifiers of the form "<digits>@facebook.com" and captures the
# numeric part; compiled once at import time.
_FB_ID_PATTERN = re.compile(r"(\d+)@facebook\.com")
# Cache keyed by numeric Facebook id. resolve_user_id() stores the resolved
# name here, or the numeric id itself when the lookup did not succeed.
_mapped_fb_ids = dict()
# Path to the messages file; None until initialize() has located it.
_messages_file = None
def initialize(dump_directory="."):
    """
    Locates the messages file of a Facebook dump and remembers its path.

    The file is expected at ``html/messages.htm`` relative to
    ``dump_directory``. When it exists, its path is stored in the
    module-level ``_messages_file``; otherwise a hint is printed and
    ``_messages_file`` is reset to None.

    Args:
        dump_directory: path to the directory that contains the
            html/messages.htm file (defaults to the current directory)
    """
    global _messages_file
    candidate = os.path.join(dump_directory, "html/messages.htm")
    if os.path.isfile(candidate):
        _messages_file = candidate
        return
    # File is missing: tell the user and leave the module uninitialized
    print("""
The directory provided did not contain messages.htm,
the directory should be within the archive
downloaded from facebook.com
""")
    _messages_file = None
def resolve_user_id(fb_provided_identifier, timeout=10):
    """
    Tries to map the provided identifier for facebook to the name of the user

    Results (including failed lookups) are cached in ``_mapped_fb_ids`` so
    that each numeric id triggers at most one web request.

    Args:
        fb_provided_identifier: identifier string that is provided
            in the messages file. This might be the user's name or
            their Facebook ID. Only if the input is a Facebook ID in the
            form of a number followed by @facebook.com, then it will be
            resolved. For example an identifier such as
            "1234567890@facebook.com" is looked up via
            https://www.facebook.com/1234567890 and mapped to the name
            found in the title of that page.
        timeout: number of seconds to wait for facebook.com before giving
            up on the lookup, so a stalled connection cannot hang the caller
    Returns:
        The name of the user if it is able to find it. If the input is not
        a Facebook ID (or is empty/None) the input is returned unchanged;
        if the lookup fails the numeric id is returned.
    """
    if not fb_provided_identifier:
        # None or empty string: nothing to resolve, and re.match would
        # raise a TypeError on None
        return fb_provided_identifier

    fb_id_pattern_match = _FB_ID_PATTERN.match(fb_provided_identifier)
    if not fb_id_pattern_match:
        # fb_provided_identifier is not in the form
        # we are expecting (_FB_ID_PATTERN),
        # so it should not be mapped
        return fb_provided_identifier

    fb_numeric_id = fb_id_pattern_match.group(1)
    if fb_numeric_id in _mapped_fb_ids:
        # We have looked the id up before, so just return the result
        return _mapped_fb_ids[fb_numeric_id]

    try:
        # Try to find the user's name in the title of their profile page
        fb_user_page = requests.get(
            "https://www.facebook.com/{}".format(fb_numeric_id),
            timeout=timeout)
        # Name the parser explicitly so the result does not depend on
        # which optional parsers happen to be installed
        fb_page_title = BeautifulSoup(
            fb_user_page.content, "html.parser").title.string
        possible_username = fb_page_title.split("|")[0].strip()
        if possible_username.startswith(
                ("Security Check Required", "Page Not Found")):
            # Mapping not found for this user, this likely is not transient
            # since the HTTP request validly returned, therefore do not retry
            _mapped_fb_ids[fb_numeric_id] = fb_numeric_id
            logging.info(
                "Failed to lookup %s via %s, found result %s",
                fb_provided_identifier, fb_numeric_id, fb_page_title)
        else:
            # Here we know that possible_username is the user's name
            _mapped_fb_ids[fb_numeric_id] = possible_username
            logging.debug(
                "Mapped identifier %s to %s",
                fb_provided_identifier, possible_username)
    except Exception as e:
        # Deliberately broad: the lookup is best effort (network errors,
        # pages without a title, ...). Fall back to the numeric id and
        # cache it so the failing request is not repeated per message.
        _mapped_fb_ids[fb_numeric_id] = fb_numeric_id
        logging.warning("Ran into error %s for %s", e, fb_numeric_id)
    return _mapped_fb_ids[fb_numeric_id]
def get_cleaned_fully_merged_messages(strip_html_content=True,
                                      resolve_fb_id=False):
    """
    Parses the messages file to create dataframes that contain the messages and
    their senders.

    Args:
        strip_html_content: The messages.htm file might contain some html tags
            in messages; this option will remove all html markup
        resolve_fb_id: The messages.htm file doesn't always print Facebook
            names, but sometimes ids instead; this will attempt
            to resolve them, but requires a web request per id and is not
            guaranteed to work. Note, that this method will not
            necessarily succeed, since facebook blocks requests
            above a certain volume threshold. Setting this to true can only
            improve the results, since it can always fall back to the numeric
            identifier, but it will increase the time it takes.
    Returns:
        A tuple ``(messages_df, address_book_df)``:
        messages_df has one row per message with the columns ``text``,
        ``date``, ``is_from_me`` and ``full_name``; address_book_df has a
        single ``full_name`` column listing every thread participant.
        If the module has not been initialized, a hint is printed and
        None is returned instead.
    """
    if not _messages_file:
        print("Please initialize the facebook_connector module.")
        return

    with io.open(_messages_file, mode="rt", encoding="utf-8") as handle:
        chats = parser.parse(handle=handle)
    me = chats.user
    addresses = set()
    messages = []

    try:
        # Python 2 dictionaries expose a lazy itervalues()
        threads = chats.threads.itervalues()
    except AttributeError:
        threads = chats.threads.values()

    # Suppress the warning that BS4 will display when a message only
    # contains a URL. catch_warnings() restores the previous filters on
    # exit, so the suppression does not leak into the rest of the process.
    with warnings.catch_warnings():
        warnings.filterwarnings("ignore", category=UserWarning, module='bs4')
        for thread in threads:
            # This set holds the list of participants after their identifier
            # has been resolved to their name (see resolve_user_id)
            resolved_participants = set()
            for participant in thread.participants:
                # Skip missing, empty and whitespace-only participants,
                # mirroring the content check applied to messages below
                if participant and not participant.isspace():
                    resolved_participant = resolve_user_id(
                        participant) if resolve_fb_id else participant
                    resolved_participants.add(resolved_participant)
            addresses.update(resolved_participants)

            for message in thread.messages:
                if not message.content or message.content.isspace():
                    continue
                sender = resolve_user_id(
                    message.sender) if resolve_fb_id else message.sender
                from_me = sender == me
                if strip_html_content:
                    content = BeautifulSoup(
                        message.content, "html.parser").text
                else:
                    content = message.content

                if from_me:
                    # If the user is sending a message to a group,
                    # then we need to add one message
                    # per group participant to the dataframe
                    recipients = resolved_participants
                else:
                    # Incoming messages are recorded once, under the sender
                    recipients = (sender,)
                for full_name in recipients:
                    messages.append({
                        'text': content,
                        'date': message.timestamp,
                        'is_from_me': from_me,
                        'full_name': full_name,
                    })

    address_book_df = pd.DataFrame(data=list(addresses), columns=["full_name"])
    # Pin the columns so that an archive without any usable message still
    # yields a dataframe with the expected schema
    messages_df = pd.DataFrame.from_records(
        messages, columns=['text', 'date', 'is_from_me', 'full_name'])
    return messages_df, address_book_df