Skip to content
Snippets Groups Projects
Unverified Commit 02f99906 authored by Richard van der Hoff's avatar Richard van der Hoff Committed by GitHub
Browse files

Merge pull request #6331 from matrix-org/rav/url_preview_limit_title

Strip overlong OpenGraph data from url preview
parents 40860028 55a7da24
No related branches found
No related tags found
No related merge requests found
Limit the length of data returned by url previews, to prevent DoS attacks.
......@@ -56,6 +56,9 @@ logger = logging.getLogger(__name__)
_charset_match = re.compile(br"<\s*meta[^>]*charset\s*=\s*([a-z0-9-]+)", flags=re.I)
_content_type_match = re.compile(r'.*; *charset="?(.*?)"?(;|$)', flags=re.I)
OG_TAG_NAME_MAXLEN = 50
OG_TAG_VALUE_MAXLEN = 1000
class PreviewUrlResource(DirectServeResource):
isLeaf = True
......@@ -171,7 +174,7 @@ class PreviewUrlResource(DirectServeResource):
ts (int):
Returns:
Deferred[str]: json-encoded og data
Deferred[bytes]: json-encoded og data
"""
# check the URL cache in the DB (which will also provide us with
# historical previews, if we have any)
......@@ -272,6 +275,17 @@ class PreviewUrlResource(DirectServeResource):
logger.warning("Failed to find any OG data in %s", url)
og = {}
# filter out any stupidly long values
keys_to_remove = []
for k, v in og.items():
if len(k) > OG_TAG_NAME_MAXLEN or len(v) > OG_TAG_VALUE_MAXLEN:
logger.warning(
"Pruning overlong tag %s from OG data", k[:OG_TAG_NAME_MAXLEN]
)
keys_to_remove.append(k)
for k in keys_to_remove:
del og[k]
logger.debug("Calculated OG for %s as %s", url, og)
jsonog = json.dumps(og)
......@@ -506,6 +520,10 @@ def _calc_og(tree, media_uri):
og = {}
for tag in tree.xpath("//*/meta[starts-with(@property, 'og:')]"):
if "content" in tag.attrib:
# if we've got more than 50 tags, someone is taking the piss
if len(og) >= 50:
logger.warning("Skipping OG for page with too many 'og:' tags")
return {}
og[tag.attrib["property"]] = tag.attrib["content"]
# TODO: grab article: meta tags too, e.g.:
......
......@@ -247,6 +247,41 @@ class URLPreviewTests(unittest.HomeserverTestCase):
self.assertEqual(channel.code, 200)
self.assertEqual(channel.json_body["og:title"], "\u0434\u043a\u0430")
def test_overlong_title(self):
self.lookups["matrix.org"] = [(IPv4Address, "8.8.8.8")]
end_content = (
b"<html><head>"
b"<title>" + b"x" * 2000 + b"</title>"
b'<meta property="og:description" content="hi" />'
b"</head></html>"
)
request, channel = self.make_request(
"GET", "url_preview?url=http://matrix.org", shorthand=False
)
request.render(self.preview_url)
self.pump()
client = self.reactor.tcpClients[0][2].buildProtocol(None)
server = AccumulatingProtocol()
server.makeConnection(FakeTransport(client, self.reactor))
client.makeConnection(FakeTransport(server, self.reactor))
client.dataReceived(
(
b"HTTP/1.0 200 OK\r\nContent-Length: %d\r\n"
b'Content-Type: text/html; charset="windows-1251"\r\n\r\n'
)
% (len(end_content),)
+ end_content
)
self.pump()
self.assertEqual(channel.code, 200)
res = channel.json_body
# We should only see the `og:description` field, as `title` is too long and should be stripped out
self.assertCountEqual(["og:description"], res.keys())
def test_ipaddr(self):
"""
IP addresses can be previewed directly.
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment