From 3c99e9f3c9d2015252d3e2372793721a6703ce62 Mon Sep 17 00:00:00 2001 From: Lesmiscore <nao20010128@gmail.com> Date: Wed, 9 Mar 2022 02:01:05 +0900 Subject: [PATCH] devscripts/instance list: create helper file [skip ci] --- .../workflows/generate-files-automatic.yml | 1 + devscripts/make_mastodon_instance_list.py | 35 ++--------- devscripts/make_misskey_instance_list.py | 32 ++-------- devscripts/make_peertube_instance_list.py | 34 ++--------- devscripts/scraper_helper.py | 61 +++++++++++++++++++ 5 files changed, 79 insertions(+), 84 deletions(-) create mode 100644 devscripts/scraper_helper.py diff --git a/.github/workflows/generate-files-automatic.yml b/.github/workflows/generate-files-automatic.yml index 96d94892ac..953c26be79 100644 --- a/.github/workflows/generate-files-automatic.yml +++ b/.github/workflows/generate-files-automatic.yml @@ -8,6 +8,7 @@ on: - devscripts/make_peertube_instance_list.py - devscripts/make_misskey_version_list.py - devscripts/make_chrome_version_list.py + - devscripts/scraper_helper.py branches: - ytdlp schedule: diff --git a/devscripts/make_mastodon_instance_list.py b/devscripts/make_mastodon_instance_list.py index 3948a17e1d..c32bdcd880 100644 --- a/devscripts/make_mastodon_instance_list.py +++ b/devscripts/make_mastodon_instance_list.py @@ -1,40 +1,20 @@ # coding: utf-8 -from __future__ import unicode_literals, print_function +from __future__ import unicode_literals import sys import os import re import itertools +sys.path[:0] = ['.', 'devscripts'] -sys.path[:0] = ['.'] - +from scraper_helper import ie, sanitize_hostname, traverse_sanitize from yt_dlp.utils import ExtractorError, parse_qs -from yt_dlp.extractor.common import InfoExtractor -from test.helper import FakeYDL - - -class TestIE(InfoExtractor): - pass - -ie = TestIE(FakeYDL({ - 'verbose': False, - 'socket_timeout': 120, -})) script_id = 'mastodon' results = set() - -def sanitize_hostname(hostname): - # trim trailing slashes - hostname = re.sub(r'[/\\]+$', '', hostname) - 
# trim port number - hostname = re.sub(r':\d+$', '', hostname) - return hostname - # Mastodon - instance_social_api_key = os.environ.get('INSTANCE_SOCIAL_API_SECRET') if instance_social_api_key: min_id = None @@ -45,8 +25,7 @@ def sanitize_hostname(hostname): data = ie._download_json( url, script_id, note=f'Paging {min_id}, len(results)={len(results)}', headers={'Authorization': f'Bearer {instance_social_api_key}'}) - for instance in data['instances']: - results.add(sanitize_hostname(instance['name'])) + results.update(traverse_sanitize(data, ('instances', ..., 'name'))) min_id = data['pagination'].get('next_id') if not min_id: break @@ -75,8 +54,7 @@ def sanitize_hostname(hostname): 'content-type': 'application/json, application/graphql', 'accept': 'application/json, application/graphql', }) - for instance in data['data']['nodes']: - results.add(sanitize_hostname(instance['host'])) + results.update(traverse_sanitize(data, ('data', 'nodes', ..., 'host'))) except KeyboardInterrupt: raise except BaseException as ex: @@ -138,8 +116,7 @@ def sanitize_hostname(hostname): 'content-type': 'application/json, application/graphql', 'accept': 'application/json, application/graphql', }) - for instance in data['data']['nodes']: - results.add(sanitize_hostname(instance['host'])) + results.update(traverse_sanitize(data, ('data', 'nodes', ..., 'host'))) except KeyboardInterrupt: raise except BaseException as ex: diff --git a/devscripts/make_misskey_instance_list.py b/devscripts/make_misskey_instance_list.py index b91155f557..6caf099fdc 100644 --- a/devscripts/make_misskey_instance_list.py +++ b/devscripts/make_misskey_instance_list.py @@ -1,42 +1,21 @@ # coding: utf-8 -from __future__ import unicode_literals, print_function +from __future__ import unicode_literals import sys -import re - -sys.path[:0] = ['.'] +sys.path[:0] = ['.', 'devscripts'] +from scraper_helper import ie, traverse_sanitize from yt_dlp.utils import ExtractorError -from yt_dlp.extractor.common import 
InfoExtractor -from test.helper import FakeYDL - - -class TestIE(InfoExtractor): - pass - -ie = TestIE(FakeYDL({ - 'verbose': False, - 'socket_timeout': 120, -})) script_id = 'misskey' results = set() -def sanitize_hostname(hostname): - # trim trailing slashes - hostname = re.sub(r'[/\\]+$', '', hostname) - # trim port number - hostname = re.sub(r':\d+$', '', hostname) - return hostname - - if True: url = 'https://instanceapp.misskey.page/instances.json' data = ie._download_json( url, script_id, note=f'Scraping join.misskey.page, len(results)={len(results)}') - for instance in data['instancesInfos']: - results.add(sanitize_hostname(instance['url'])) + results.update(traverse_sanitize(data, ('instancesInfos', ..., 'url'))) if True: @@ -48,8 +27,7 @@ def sanitize_hostname(hostname): 'content-type': 'application/json, application/graphql', 'accept': 'application/json, application/graphql', }) - for instance in data['data']['nodes']: - results.add(sanitize_hostname(instance['host'])) + results.update(traverse_sanitize(data, ('data', 'nodes', ..., 'host'))) except BaseException: pass diff --git a/devscripts/make_peertube_instance_list.py b/devscripts/make_peertube_instance_list.py index 9e2bee764e..4d7ef42de2 100644 --- a/devscripts/make_peertube_instance_list.py +++ b/devscripts/make_peertube_instance_list.py @@ -1,48 +1,28 @@ # coding: utf-8 -from __future__ import unicode_literals, print_function +from __future__ import unicode_literals import sys import re +sys.path[:0] = ['.', 'devscripts'] -sys.path[:0] = ['.'] - +from scraper_helper import ie, sanitize_hostname, traverse_sanitize from yt_dlp.utils import ExtractorError -from yt_dlp.extractor.common import InfoExtractor -from test.helper import FakeYDL - - -class TestIE(InfoExtractor): - pass - -ie = TestIE(FakeYDL({ - 'verbose': False, - 'socket_timeout': 120, -})) script_id = 'peertube' results = set() -def sanitize_hostname(hostname): - # trim trailing slashes - hostname = re.sub(r'[/\\]+$', '', hostname) - # 
trim port number - hostname = re.sub(r':\d+$', '', hostname) - return hostname - - begin, page_size = 0, 10 while True: url = 'https://instances.joinpeertube.org/api/v1/instances?start=%d&count=%d&sort=-createdAt' % (begin, page_size) data = ie._download_json( url, script_id, note=f'Paging https://instances.joinpeertube.org {begin}, len(results)={len(results)}') - for instance in data['data']: - results.add(sanitize_hostname(instance['host'])) + results.update(traverse_sanitize(data, ('data', ..., 'host'))) begin += page_size if not data['data']: break -while True: +if True: try: url = 'https://the-federation.info/graphql?query=query%20Platform(%24name%3A%20String!)%20%7B%0A%20%20platforms(name%3A%20%24name)%20%7B%0A%20%20%20%20name%0A%20%20%20%20code%0A%20%20%20%20displayName%0A%20%20%20%20description%0A%20%20%20%20tagline%0A%20%20%20%20website%0A%20%20%20%20icon%0A%20%20%20%20__typename%0A%20%20%7D%0A%20%20nodes(platform%3A%20%24name)%20%7B%0A%20%20%20%20id%0A%20%20%20%20name%0A%20%20%20%20version%0A%20%20%20%20openSignups%0A%20%20%20%20host%0A%20%20%20%20platform%20%7B%0A%20%20%20%20%20%20name%0A%20%20%20%20%20%20icon%0A%20%20%20%20%20%20__typename%0A%20%20%20%20%7D%0A%20%20%20%20countryCode%0A%20%20%20%20countryFlag%0A%20%20%20%20countryName%0A%20%20%20%20services%20%7B%0A%20%20%20%20%20%20name%0A%20%20%20%20%20%20__typename%0A%20%20%20%20%7D%0A%20%20%20%20__typename%0A%20%20%7D%0A%20%20statsGlobalToday(platform%3A%20%24name)%20%7B%0A%20%20%20%20usersTotal%0A%20%20%20%20usersHalfYear%0A%20%20%20%20usersMonthly%0A%20%20%20%20localPosts%0A%20%20%20%20localComments%0A%20%20%20%20__typename%0A%20%20%7D%0A%20%20statsNodes(platform%3A%20%24name)%20%7B%0A%20%20%20%20node%20%7B%0A%20%20%20%20%20%20id%0A%20%20%20%20%20%20__typename%0A%20%20%20%20%7D%0A%20%20%20%20usersTotal%0A%20%20%20%20usersHalfYear%0A%20%20%20%20usersMonthly%0A%20%20%20%20localPosts%0A%20%20%20%20localComments%0A%20%20%20%20__typename%0A%20%20%7D%0A%7D%0A&operationName=Platform&variables=%7B%22name%22
%3A%22peertube%22%7D' data = ie._download_json( @@ -51,9 +31,7 @@ def sanitize_hostname(hostname): 'content-type': 'application/json, application/graphql', 'accept': 'application/json, application/graphql', }) - for instance in data['data']['nodes']: - results.add(sanitize_hostname(instance['host'])) - break + results.update(traverse_sanitize(data, ('data', 'nodes', ..., 'host'))) except BaseException as ex: ie.report_warning(ex) diff --git a/devscripts/scraper_helper.py b/devscripts/scraper_helper.py new file mode 100644 index 0000000000..c80f23753a --- /dev/null +++ b/devscripts/scraper_helper.py @@ -0,0 +1,61 @@ +# coding: utf-8 +from __future__ import unicode_literals + +import re + +from yt_dlp.utils import ExtractorError, traverse_obj +from yt_dlp.extractor.common import InfoExtractor +from test.helper import FakeYDL + + +class TestIE(InfoExtractor): + pass + + +ie = TestIE(FakeYDL({ + 'verbose': False, + 'socket_timeout': 120, +})) + +orig_dwh = ie._download_webpage_handle + +dwh_args = orig_dwh.__code__.co_varnames[1:orig_dwh.__code__.co_argcount] + + +def retry_download_webpage_handle(*args, **kw): + kw = { + **dict(zip(dwh_args, args)), + **kw, + } + retries = 3 + fatal = kw.get('fatal', True) + note = kw.get('note') + kw['fatal'] = True + for i in range(retries + 1): + try: + if i: + kw['note'] = f'{note} (retry {i} of {retries})' + return orig_dwh(**kw) + except ExtractorError as e: + if i == retries: # initial attempt plus every retry has failed + if fatal: + raise + else: + ie.report_warning(e) + return False # non-fatal failure: InfoExtractor callers test for False, not None + ie.report_warning(f'{e} Retrying...') + + +ie._download_webpage_handle = retry_download_webpage_handle + + +def sanitize_hostname(hostname): + # trim trailing slashes + hostname = re.sub(r'[/\\]+$', '', hostname) + # trim port number + hostname = re.sub(r':\d+$', '', hostname) + return hostname + + +def traverse_sanitize(*arg, **kw): + return map(sanitize_hostname, traverse_obj(*arg, **kw) or []) -- GitLab