Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement NCMEC Upvote/Downvote in reference API #1624

Closed
wants to merge 6 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
import urllib.parse

import requests
from requests.packages.urllib3.util.retry import Retry
from urllib3.util.retry import Retry
from threatexchange.exchanges.clients.utils.common import TimeoutHTTPAdapter


Expand Down Expand Up @@ -123,6 +123,19 @@ class NCMECEntryType(Enum):
video = "video"


@unique
class NCMECFeedbackType(Enum):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(note to self): Add note that these are also the entry types

md5 = "MD5"
sha1 = "SHA1"
pdna = "PDNA"
pdq = "PDQ"
netclean = "NETCLEAN"
videntifier = "VIDENTIFIER"
tmk_pdqf = "TMK_PDQF"
ssvh_pdna = "SSVH_PDNA"
ssvh_safer_hash = "SSVH_SAFER_HASH"


@dataclass
class NCMECEntryUpdate:
id: str
Expand All @@ -131,6 +144,7 @@ class NCMECEntryUpdate:
deleted: bool
classification: t.Optional[str]
fingerprints: t.Dict[str, str]
feedback: t.List[t.Dict[str, t.Any]]
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(note to self): Consider whether it makes sense to use the enum here, or whether to leave as string in all cases


@classmethod
def from_xml(cls, xml: _XMLWrapper) -> "NCMECEntryUpdate":
Expand All @@ -148,6 +162,36 @@ def from_xml(cls, xml: _XMLWrapper) -> "NCMECEntryUpdate":
fingerprints={
x.tag: x.text for x in xml.maybe("fingerprints") if x.has_text
},
feedback=(
[
{
"sentiment": x.tag, # "affirmativeFeedback" or "negativeFeedback"
"type": x.str("type"),
"latest_feedback_time": x.str("lastUpdateTimestamp"),
"members": [
{"id": m.str("id"), "name": m.text}
for m in x.maybe("members")
if m.has_text
],
Comment on lines +171 to +175
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(note to self): Should this be dict instead of synthesizing a a dict? What about a dedicated class?

"reasons": [
{
"guid": r.maybe("reason").str("guid"),
"name": r.maybe("reason").str("name"),
"type": r.maybe("reason").str("type"),
"members": [
{"id": m.str("id"), "name": m.text}
for m in x.maybe("members")
],
}
for r in x.maybe("reasons")
if r.maybe("reason")
],
}
for x in xml.maybe("feedback")
]
if xml.maybe("feedback").has_text
else []
),
)


Expand Down Expand Up @@ -215,11 +259,49 @@ def estimated_entries_in_range(self) -> int:
)


# TODO: once we know the shape of response, finish this class
@dataclass
class UpdateEntryResponse:
updates: t.List[NCMECEntryUpdate]

@classmethod
def from_xml(
cls, xml: _XMLWrapper, fallback_max_time: int
) -> "UpdateEntryResponse":
updates: t.List[NCMECEntryUpdate] = []

for content_xml in (xml.maybe("images"), xml.maybe("videos")):
if not content_xml or not len(content_xml):
continue
updates.extend(NCMECEntryUpdate.from_xml(c) for c in content_xml)

return cls(updates)


@dataclass
class GetFeedbackReasonsResponse:
reasons: t.List[t.Dict[str, str]]

@classmethod
def from_xml(cls, xml: _XMLWrapper) -> "GetFeedbackReasonsResponse":
reasons = []
for reason in xml.maybe("availableFeedbackReasons"):
reasons.append(
{
"guid": reason.str("guid"),
"name": reason.str("name"),
"type": reason.str("type"),
}
Comment on lines +290 to +294
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(note to self): Use a dedicated class here instead of dict

)
return cls(reasons)


@unique
class NCMECEndpoint(Enum):
status = "status"
entries = "entries"
members = "members"
feedback = "feedback"


class NCMECEnvironment(Enum):
Expand Down Expand Up @@ -261,15 +343,19 @@ def __init__(
username: str,
password: str,
environment: NCMECEnvironment,
member_id: t.Optional[str] = None,
reasons_map: t.Dict[str, t.List[t.Dict[str, str]]] = {},
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

blocking: We won't know this at client construction time, and can poll it afterwards.

) -> None:
assert is_valid_user_pass(username, password)
self.username = username
self.password = password
self._base_url = environment.value
self.member_id = member_id
self.reasons_map = reasons_map or {}
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm this or {} is fairly subtle. I'm assuming we are doing it because the default is {} which is a shared instance across any object, and the or {} will replace it with a new instance per call, but having it be optional and default to None is more easy to prove doesn't have a bug.


def _get_session(self) -> requests.Session:
"""
Custom requests sesson
Custom requests session

Ideally, should be used within a context manager:
```
Expand All @@ -295,14 +381,18 @@ def _get_session(self) -> requests.Session:
)
return session

def _get(self, endpoint: NCMECEndpoint, *, next_: str = "", **params) -> ET.Element:
def _get(
self, endpoint: NCMECEndpoint, *, path: str = "", next_: str = "", **params
) -> ET.Element:
"""
Perform an HTTP GET request, and return the XML response payload.

Same timeouts and retry strategy as `_get_session` above.
"""

url = "/".join((self._base_url, self.VERSION, endpoint.value))
if path:
url = "/".join((url, path))
if next_:
url = self._base_url + next_
params = {}
Expand All @@ -328,16 +418,49 @@ def _post(self, endpoint: NCMECEndpoint, *, data=None) -> t.Any:
No timeout or retry strategy.
"""

url = "/".join((self._base_url, endpoint.value))
url = "/".join((self._base_url, self.VERSION, endpoint.value))
with self._get_session() as session:
response = session.post(url, data=data)
response.raise_for_status()
return response

def _put(
self,
endpoint: NCMECEndpoint,
*,
member_id: t.Optional[str] = None,
entry_id: t.Optional[str] = None,
feedback_type: t.Optional[NCMECFeedbackType] = None,
data=None,
) -> t.Any:
"""
Perform an HTTP PUT request, and return the XML response payload.

No timeout or retry strategy.
"""

url = "/".join((self._base_url, self.VERSION, endpoint.value))
if feedback_type and member_id and entry_id:
url = "/".join(
(
self._base_url,
endpoint.value,
member_id,
entry_id,
feedback_type.value,
NCMECEndpoint.feedback.value,
)
)
with self._get_session() as session:
response = session.put(url, data=data)
response.raise_for_status()
return response

def status(self) -> StatusResult:
"""Query the status endpoint, which tells you who you are."""
response = self._get(NCMECEndpoint.status)
member = _XMLWrapper(response)["member"]
self.member_id = member.str("id")
return StatusResult(member.int("id"), member.text)

def members(self) -> t.List[StatusResult]:
Expand All @@ -348,6 +471,17 @@ def members(self) -> t.List[StatusResult]:
for member in _XMLWrapper(response)
]

def feedback_reasons(self) -> GetFeedbackReasonsResponse:
"""Get the possible negative feedback reasons for each feedback type"""
for feedbackType in NCMECFeedbackType:
resp = self._get(
NCMECEndpoint.feedback, path=f"{feedbackType.value}/reasons"
)
reasonsResp = GetFeedbackReasonsResponse.from_xml(_XMLWrapper(resp))
self.reasons_map[feedbackType.value] = reasonsResp.reasons

return reasonsResp

def get_entries(
self,
*,
Expand Down Expand Up @@ -401,6 +535,55 @@ def get_entries_iter(
has_more = bool(next_)
yield result

def submit_feedback(
self,
entry_id: str,
feedback_type: NCMECFeedbackType,
affirmative: bool,
reason_id: t.Optional[str] = None,
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(note to self): Is a reason string better to use here than the guid?

) -> GetEntriesResponse:
if not affirmative and not reason_id:
raise ValueError("Negative feedback must have a reason_id")
Comment on lines +545 to +546
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(note to self): If reason map only has one item in it, choose that item. Also test NCMEC API to make sure it won't accept empty reason, which should be our pref otherwise.


# need member_id to submit feedback
if not self.member_id:
self.status()

# need valid reasons to submit negative feedback
if not affirmative and not self.reasons_map:
self.feedback_reasons()

# Prepare the XML payload
root = ET.Element("feedbackSubmission")
root.set("xmlns", "https://hashsharing.ncmec.org/hashsharing/v2")
vote = ET.SubElement(root, "affirmative" if affirmative else "negative")

if not affirmative:
valid_reason_ids = [
reason["guid"] for reason in self.reasons_map[feedback_type.value]
]
if reason_id not in valid_reason_ids:
print(
"must choose from the following reasons: ",
self.reasons_map[feedback_type.value],
)
raise ValueError("Invalid reason_id")
reasons = ET.SubElement(vote, "reasonIds")
guid = ET.SubElement(reasons, "guid")
guid.text = reason_id
# ET.dump(root)

resp = self._put(
NCMECEndpoint.entries,
member_id=self.member_id,
entry_id=entry_id,
feedback_type=feedback_type,
data=ET.tostring(root),
)

# TODO: parse response here once we know the shape using UpdateEntryResponse
return resp


def _date_format(timestamp: int) -> str:
"""ISO 8601 format yyyy-MM-dd'T'HH:mm:ss.SSSZ"""
Expand Down
Loading