From 3c99e9f3c9d2015252d3e2372793721a6703ce62 Mon Sep 17 00:00:00 2001
From: Lesmiscore <nao20010128@gmail.com>
Date: Wed, 9 Mar 2022 02:01:05 +0900
Subject: [PATCH] devscripts/instance list: create helper file [skip ci]

---
 .../workflows/generate-files-automatic.yml    |  1 +
 devscripts/make_mastodon_instance_list.py     | 35 ++---------
 devscripts/make_misskey_instance_list.py      | 32 ++--------
 devscripts/make_peertube_instance_list.py     | 34 ++---------
 devscripts/scraper_helper.py                  | 61 +++++++++++++++++++
 5 files changed, 79 insertions(+), 84 deletions(-)
 create mode 100644 devscripts/scraper_helper.py

diff --git a/.github/workflows/generate-files-automatic.yml b/.github/workflows/generate-files-automatic.yml
index 96d94892ac..953c26be79 100644
--- a/.github/workflows/generate-files-automatic.yml
+++ b/.github/workflows/generate-files-automatic.yml
@@ -8,6 +8,7 @@ on:
       - devscripts/make_peertube_instance_list.py
       - devscripts/make_misskey_version_list.py
       - devscripts/make_chrome_version_list.py
+      - devscripts/scraper_helper.py
     branches:
       - ytdlp
   schedule:
diff --git a/devscripts/make_mastodon_instance_list.py b/devscripts/make_mastodon_instance_list.py
index 3948a17e1d..c32bdcd880 100644
--- a/devscripts/make_mastodon_instance_list.py
+++ b/devscripts/make_mastodon_instance_list.py
@@ -1,40 +1,20 @@
 # coding: utf-8
-from __future__ import unicode_literals, print_function
+from __future__ import unicode_literals
 
 import sys
 import os
 import re
 import itertools
+sys.path[:0] = ['.', 'devscripts']
 
-sys.path[:0] = ['.']
-
+from scraper_helper import ie, sanitize_hostname, traverse_sanitize
 from yt_dlp.utils import ExtractorError, parse_qs
-from yt_dlp.extractor.common import InfoExtractor
-from test.helper import FakeYDL
-
-
-class TestIE(InfoExtractor):
-    pass
 
-
-ie = TestIE(FakeYDL({
-    'verbose': False,
-    'socket_timeout': 120,
-}))
 script_id = 'mastodon'
 results = set()
 
-
-def sanitize_hostname(hostname):
-    # trim trailing slashes
-    hostname = re.sub(r'[/\\]+$', '', hostname)
-    # trim port number
-    hostname = re.sub(r':\d+$', '', hostname)
-    return hostname
-
 # Mastodon
 
-
 instance_social_api_key = os.environ.get('INSTANCE_SOCIAL_API_SECRET')
 if instance_social_api_key:
     min_id = None
@@ -45,8 +25,7 @@ def sanitize_hostname(hostname):
         data = ie._download_json(
             url, script_id, note=f'Paging {min_id}, len(results)={len(results)}',
             headers={'Authorization': f'Bearer {instance_social_api_key}'})
-        for instance in data['instances']:
-            results.add(sanitize_hostname(instance['name']))
+        results.update(traverse_sanitize(data, ('instances', ..., 'name')))
         min_id = data['pagination'].get('next_id')
         if not min_id:
             break
@@ -75,8 +54,7 @@ def sanitize_hostname(hostname):
                 'content-type': 'application/json, application/graphql',
                 'accept': 'application/json, application/graphql',
             })
-        for instance in data['data']['nodes']:
-            results.add(sanitize_hostname(instance['host']))
+        results.update(traverse_sanitize(data, ('data', 'nodes', ..., 'host')))
     except KeyboardInterrupt:
         raise
     except BaseException as ex:
@@ -138,8 +116,7 @@ def sanitize_hostname(hostname):
                 'content-type': 'application/json, application/graphql',
                 'accept': 'application/json, application/graphql',
             })
-        for instance in data['data']['nodes']:
-            results.add(sanitize_hostname(instance['host']))
+        results.update(traverse_sanitize(data, ('data', 'nodes', ..., 'host')))
     except KeyboardInterrupt:
         raise
     except BaseException as ex:
diff --git a/devscripts/make_misskey_instance_list.py b/devscripts/make_misskey_instance_list.py
index b91155f557..6caf099fdc 100644
--- a/devscripts/make_misskey_instance_list.py
+++ b/devscripts/make_misskey_instance_list.py
@@ -1,42 +1,21 @@
 # coding: utf-8
-from __future__ import unicode_literals, print_function
+from __future__ import unicode_literals
 
 import sys
-import re
-
-sys.path[:0] = ['.']
+sys.path[:0] = ['.', 'devscripts']
 
+from scraper_helper import ie, traverse_sanitize
 from yt_dlp.utils import ExtractorError
-from yt_dlp.extractor.common import InfoExtractor
-from test.helper import FakeYDL
-
-
-class TestIE(InfoExtractor):
-    pass
 
-
-ie = TestIE(FakeYDL({
-    'verbose': False,
-    'socket_timeout': 120,
-}))
 script_id = 'misskey'
 results = set()
 
 
-def sanitize_hostname(hostname):
-    # trim trailing slashes
-    hostname = re.sub(r'[/\\]+$', '', hostname)
-    # trim port number
-    hostname = re.sub(r':\d+$', '', hostname)
-    return hostname
-
-
 if True:
     url = 'https://instanceapp.misskey.page/instances.json'
     data = ie._download_json(
         url, script_id, note=f'Scraping join.misskey.page, len(results)={len(results)}')
-    for instance in data['instancesInfos']:
-        results.add(sanitize_hostname(instance['url']))
+    results.update(traverse_sanitize(data, ('instancesInfos', ..., 'url')))
 
 
 if True:
@@ -48,8 +27,7 @@ def sanitize_hostname(hostname):
                 'content-type': 'application/json, application/graphql',
                 'accept': 'application/json, application/graphql',
             })
-        for instance in data['data']['nodes']:
-            results.add(sanitize_hostname(instance['host']))
+        results.update(traverse_sanitize(data, ('data', 'nodes', ..., 'host')))
     except BaseException:
         pass
 
diff --git a/devscripts/make_peertube_instance_list.py b/devscripts/make_peertube_instance_list.py
index 9e2bee764e..4d7ef42de2 100644
--- a/devscripts/make_peertube_instance_list.py
+++ b/devscripts/make_peertube_instance_list.py
@@ -1,48 +1,28 @@
 # coding: utf-8
-from __future__ import unicode_literals, print_function
+from __future__ import unicode_literals
 
 import sys
 import re
+sys.path[:0] = ['.', 'devscripts']
 
-sys.path[:0] = ['.']
-
+from scraper_helper import ie, sanitize_hostname, traverse_sanitize
 from yt_dlp.utils import ExtractorError
-from yt_dlp.extractor.common import InfoExtractor
-from test.helper import FakeYDL
-
-
-class TestIE(InfoExtractor):
-    pass
-
 
-ie = TestIE(FakeYDL({
-    'verbose': False,
-    'socket_timeout': 120,
-}))
 script_id = 'peertube'
 results = set()
 
 
-def sanitize_hostname(hostname):
-    # trim trailing slashes
-    hostname = re.sub(r'[/\\]+$', '', hostname)
-    # trim port number
-    hostname = re.sub(r':\d+$', '', hostname)
-    return hostname
-
-
 begin, page_size = 0, 10
 while True:
     url = 'https://instances.joinpeertube.org/api/v1/instances?start=%d&count=%d&sort=-createdAt' % (begin, page_size)
     data = ie._download_json(
         url, script_id, note=f'Paging https://instances.joinpeertube.org {begin}, len(results)={len(results)}')
-    for instance in data['data']:
-        results.add(sanitize_hostname(instance['host']))
+    results.update(traverse_sanitize(data, ('data', ..., 'host')))
     begin += page_size
     if not data['data']:
         break
 
-while True:
+if True:
     try:
         url = 'https://the-federation.info/graphql?query=query%20Platform(%24name%3A%20String!)%20%7B%0A%20%20platforms(name%3A%20%24name)%20%7B%0A%20%20%20%20name%0A%20%20%20%20code%0A%20%20%20%20displayName%0A%20%20%20%20description%0A%20%20%20%20tagline%0A%20%20%20%20website%0A%20%20%20%20icon%0A%20%20%20%20__typename%0A%20%20%7D%0A%20%20nodes(platform%3A%20%24name)%20%7B%0A%20%20%20%20id%0A%20%20%20%20name%0A%20%20%20%20version%0A%20%20%20%20openSignups%0A%20%20%20%20host%0A%20%20%20%20platform%20%7B%0A%20%20%20%20%20%20name%0A%20%20%20%20%20%20icon%0A%20%20%20%20%20%20__typename%0A%20%20%20%20%7D%0A%20%20%20%20countryCode%0A%20%20%20%20countryFlag%0A%20%20%20%20countryName%0A%20%20%20%20services%20%7B%0A%20%20%20%20%20%20name%0A%20%20%20%20%20%20__typename%0A%20%20%20%20%7D%0A%20%20%20%20__typename%0A%20%20%7D%0A%20%20statsGlobalToday(platform%3A%20%24name)%20%7B%0A%20%20%20%20usersTotal%0A%20%20%20%20usersHalfYear%0A%20%20%20%20usersMonthly%0A%20%20%20%20localPosts%0A%20%20%20%20localComments%0A%20%20%20%20__typename%0A%20%20%7D%0A%20%20statsNodes(platform%3A%20%24name)%20%7B%0A%20%20%20%20node%20%7B%0A%20%20%20%20%20%20id%0A%20%20%20%20%20%20__typename%0A%20%20%20%20%7D%0A%20%20%20%20usersTotal%0A%20%20%20%20usersHalfYear%0A%20%20%20%20usersMonthly%0A%20%20%20%20localPosts%0A%20%20%20%20localComments%0A%20%20%20%20__typename%0A%20%20%7D%0A%7D%0A&operationName=Platform&variables=%7B%22name%22%3A%22peertube%22%7D'
         data = ie._download_json(
@@ -51,9 +31,7 @@ def sanitize_hostname(hostname):
                 'content-type': 'application/json, application/graphql',
                 'accept': 'application/json, application/graphql',
             })
-        for instance in data['data']['nodes']:
-            results.add(sanitize_hostname(instance['host']))
-        break
+        results.update(traverse_sanitize(data, ('data', 'nodes', ..., 'host')))
     except BaseException as ex:
         ie.report_warning(ex)
 
diff --git a/devscripts/scraper_helper.py b/devscripts/scraper_helper.py
new file mode 100644
index 0000000000..c80f23753a
--- /dev/null
+++ b/devscripts/scraper_helper.py
@@ -0,0 +1,61 @@
+# coding: utf-8
+from __future__ import unicode_literals
+
+import re
+
+from yt_dlp.utils import ExtractorError, traverse_obj
+from yt_dlp.extractor.common import InfoExtractor
+from test.helper import FakeYDL
+
+
+class TestIE(InfoExtractor):  # bare subclass with no overrides; instantiated below as the shared `ie`
+    pass
+
+
+ie = TestIE(FakeYDL({  # shared extractor instance, imported by the make_*_instance_list scripts
+    'verbose': False,
+    'socket_timeout': 120,
+}))
+# keep a handle on the unpatched bound method; the retry wrapper delegates to it
+orig_dwh = ie._download_webpage_handle
+# names of its positional parameters (index 0, skipped, is `self`), used to map *args onto keywords
+dwh_args = orig_dwh.__code__.co_varnames[1:orig_dwh.__code__.co_argcount]
+
+
+def retry_download_webpage_handle(*args, **kw):
+    """Run the original downloader, retrying up to 3 times on ExtractorError.
+
+    If every attempt fails: re-raise when ``fatal`` (the default), otherwise
+    report a warning and return False.
+    """
+    kw = {**dict(zip(dwh_args, args)), **kw}  # positional args -> keywords
+    retries = 3
+    fatal = kw.get('fatal', True)
+    note = kw.get('note')
+    kw['fatal'] = True  # always raise here so that failures can be retried
+    for i in range(retries + 1):
+        if i:
+            kw['note'] = f'{note} (retry {i} of {retries})'
+        try:
+            return orig_dwh(**kw)
+        except ExtractorError as e:
+            if fatal and i == retries:
+                raise
+            # only the last pass is out of retries; earlier ones announce a retry
+            ie.report_warning(f'{e} Retrying...' if i < retries else e)
+    return False
+
+
+ie._download_webpage_handle = retry_download_webpage_handle  # patches this instance only; presumably ie._download_json ends up calling it -- TODO confirm
+
+
+def sanitize_hostname(hostname):
+    """Strip trailing slashes/backslashes, then a trailing ``:port``."""
+    # order matters: the port is only at the very end once the slashes are gone
+    without_slashes = re.sub(r'[/\\]+$', '', hostname)
+    without_port = re.sub(r':\d+$', '', without_slashes)
+    return without_port
+
+
+def traverse_sanitize(*arg, **kw):  # traverse_obj(*arg, **kw) results (or nothing if falsy) lazily mapped through sanitize_hostname
+    return map(sanitize_hostname, traverse_obj(*arg, **kw) or [])
-- 
GitLab