Skip to content
Snippets Groups Projects
Commit 3c99e9f3 authored by Lesmiscore's avatar Lesmiscore
Browse files

devscripts/instance list: create helper file [skip ci]

parent f1693c59
No related branches found
No related tags found
No related merge requests found
Pipeline #8331 skipped
......@@ -8,6 +8,7 @@ on:
- devscripts/make_peertube_instance_list.py
- devscripts/make_misskey_version_list.py
- devscripts/make_chrome_version_list.py
- devscripts/scraper_helper.py
branches:
- ytdlp
schedule:
......
# coding: utf-8
from __future__ import unicode_literals, print_function
from __future__ import unicode_literals
import sys
import os
import re
import itertools
sys.path[:0] = ['.', 'devscripts']
sys.path[:0] = ['.']
from scraper_helper import ie, sanitize_hostname, traverse_sanitize
from yt_dlp.utils import ExtractorError, parse_qs
from yt_dlp.extractor.common import InfoExtractor
from test.helper import FakeYDL
class TestIE(InfoExtractor):
    # Minimal concrete subclass: the scraper only needs InfoExtractor's
    # download helpers (e.g. _download_json below), not a real extractor.
    # NOTE(review): assumes InfoExtractor requires no overrides for this use.
    pass
ie = TestIE(FakeYDL({
'verbose': False,
'socket_timeout': 120,
}))
script_id = 'mastodon'
results = set()
def sanitize_hostname(hostname):
    # Normalize a scraped hostname: strip any trailing (back)slashes first,
    # then drop an explicit port suffix such as ":8080".
    stripped = re.sub(r'[/\\]+$', '', hostname)
    return re.sub(r':\d+$', '', stripped)
# Mastodon
instance_social_api_key = os.environ.get('INSTANCE_SOCIAL_API_SECRET')
if instance_social_api_key:
min_id = None
......@@ -45,8 +25,7 @@ def sanitize_hostname(hostname):
data = ie._download_json(
url, script_id, note=f'Paging {min_id}, len(results)={len(results)}',
headers={'Authorization': f'Bearer {instance_social_api_key}'})
for instance in data['instances']:
results.add(sanitize_hostname(instance['name']))
results.update(traverse_sanitize(data, ('instances', ..., 'name')))
min_id = data['pagination'].get('next_id')
if not min_id:
break
......@@ -75,8 +54,7 @@ def sanitize_hostname(hostname):
'content-type': 'application/json, application/graphql',
'accept': 'application/json, application/graphql',
})
for instance in data['data']['nodes']:
results.add(sanitize_hostname(instance['host']))
results.update(traverse_sanitize(data, ('data', 'nodes', ..., 'host')))
except KeyboardInterrupt:
raise
except BaseException as ex:
......@@ -138,8 +116,7 @@ def sanitize_hostname(hostname):
'content-type': 'application/json, application/graphql',
'accept': 'application/json, application/graphql',
})
for instance in data['data']['nodes']:
results.add(sanitize_hostname(instance['host']))
results.update(traverse_sanitize(data, ('data', 'nodes', ..., 'host')))
except KeyboardInterrupt:
raise
except BaseException as ex:
......
# coding: utf-8
from __future__ import unicode_literals, print_function
from __future__ import unicode_literals
import sys
import re
sys.path[:0] = ['.']
sys.path[:0] = ['.', 'devscripts']
from scraper_helper import ie, traverse_sanitize
from yt_dlp.utils import ExtractorError
from yt_dlp.extractor.common import InfoExtractor
from test.helper import FakeYDL
class TestIE(InfoExtractor):
    # Bare subclass so the script can instantiate an extractor and reuse its
    # JSON-download helpers. NOTE(review): no overrides appear to be needed.
    pass
ie = TestIE(FakeYDL({
'verbose': False,
'socket_timeout': 120,
}))
script_id = 'misskey'
results = set()
def sanitize_hostname(hostname):
    """Normalize a scraped hostname: drop trailing slashes and any port."""
    # trim trailing slashes
    hostname = re.sub(r'[/\\]+$', '', hostname)
    # trim port number (runs after slash-trimming so "host:80/" also works)
    hostname = re.sub(r':\d+$', '', hostname)
    return hostname
if True:
url = 'https://instanceapp.misskey.page/instances.json'
data = ie._download_json(
url, script_id, note=f'Scraping join.misskey.page, len(results)={len(results)}')
for instance in data['instancesInfos']:
results.add(sanitize_hostname(instance['url']))
results.update(traverse_sanitize(data, ('instancesInfos', ..., 'url')))
if True:
......@@ -48,8 +27,7 @@ def sanitize_hostname(hostname):
'content-type': 'application/json, application/graphql',
'accept': 'application/json, application/graphql',
})
for instance in data['data']['nodes']:
results.add(sanitize_hostname(instance['host']))
results.update(traverse_sanitize(data, ('data', 'nodes', ..., 'host')))
except BaseException:
pass
......
# coding: utf-8
from __future__ import unicode_literals, print_function
from __future__ import unicode_literals
import sys
import re
sys.path[:0] = ['.', 'devscripts']
sys.path[:0] = ['.']
from scraper_helper import ie, sanitize_hostname, traverse_sanitize
from yt_dlp.utils import ExtractorError
from yt_dlp.extractor.common import InfoExtractor
from test.helper import FakeYDL
class TestIE(InfoExtractor):
    # Minimal concrete InfoExtractor used purely for its download helpers
    # (_download_json below). NOTE(review): assumed to need no overrides.
    pass
ie = TestIE(FakeYDL({
'verbose': False,
'socket_timeout': 120,
}))
script_id = 'peertube'
results = set()
def sanitize_hostname(hostname):
    """Strip trailing slashes/backslashes and a port suffix from *hostname*."""
    # Order matters: slashes go first so "host:8080/" loses its port too.
    for pattern in (r'[/\\]+$', r':\d+$'):
        hostname = re.sub(pattern, '', hostname)
    return hostname
begin, page_size = 0, 10
while True:
url = 'https://instances.joinpeertube.org/api/v1/instances?start=%d&count=%d&sort=-createdAt' % (begin, page_size)
data = ie._download_json(
url, script_id, note=f'Paging https://instances.joinpeertube.org {begin}, len(results)={len(results)}')
for instance in data['data']:
results.add(sanitize_hostname(instance['host']))
results.update(traverse_sanitize(data, ('data', ..., 'host')))
begin += page_size
if not data['data']:
break
while True:
if True:
try:
url = 'https://the-federation.info/graphql?query=query%20Platform(%24name%3A%20String!)%20%7B%0A%20%20platforms(name%3A%20%24name)%20%7B%0A%20%20%20%20name%0A%20%20%20%20code%0A%20%20%20%20displayName%0A%20%20%20%20description%0A%20%20%20%20tagline%0A%20%20%20%20website%0A%20%20%20%20icon%0A%20%20%20%20__typename%0A%20%20%7D%0A%20%20nodes(platform%3A%20%24name)%20%7B%0A%20%20%20%20id%0A%20%20%20%20name%0A%20%20%20%20version%0A%20%20%20%20openSignups%0A%20%20%20%20host%0A%20%20%20%20platform%20%7B%0A%20%20%20%20%20%20name%0A%20%20%20%20%20%20icon%0A%20%20%20%20%20%20__typename%0A%20%20%20%20%7D%0A%20%20%20%20countryCode%0A%20%20%20%20countryFlag%0A%20%20%20%20countryName%0A%20%20%20%20services%20%7B%0A%20%20%20%20%20%20name%0A%20%20%20%20%20%20__typename%0A%20%20%20%20%7D%0A%20%20%20%20__typename%0A%20%20%7D%0A%20%20statsGlobalToday(platform%3A%20%24name)%20%7B%0A%20%20%20%20usersTotal%0A%20%20%20%20usersHalfYear%0A%20%20%20%20usersMonthly%0A%20%20%20%20localPosts%0A%20%20%20%20localComments%0A%20%20%20%20__typename%0A%20%20%7D%0A%20%20statsNodes(platform%3A%20%24name)%20%7B%0A%20%20%20%20node%20%7B%0A%20%20%20%20%20%20id%0A%20%20%20%20%20%20__typename%0A%20%20%20%20%7D%0A%20%20%20%20usersTotal%0A%20%20%20%20usersHalfYear%0A%20%20%20%20usersMonthly%0A%20%20%20%20localPosts%0A%20%20%20%20localComments%0A%20%20%20%20__typename%0A%20%20%7D%0A%7D%0A&operationName=Platform&variables=%7B%22name%22%3A%22peertube%22%7D'
data = ie._download_json(
......@@ -51,9 +31,7 @@ def sanitize_hostname(hostname):
'content-type': 'application/json, application/graphql',
'accept': 'application/json, application/graphql',
})
for instance in data['data']['nodes']:
results.add(sanitize_hostname(instance['host']))
break
results.update(traverse_sanitize(data, ('data', 'nodes', ..., 'host')))
except BaseException as ex:
ie.report_warning(ex)
......
# coding: utf-8
from __future__ import unicode_literals
import re
from yt_dlp.utils import ExtractorError, traverse_obj
from yt_dlp.extractor.common import InfoExtractor
from test.helper import FakeYDL
class TestIE(InfoExtractor):
    # Shared bare extractor for the instance-list scrapers: exposes
    # InfoExtractor's download machinery (_download_webpage_handle is
    # wrapped below). NOTE(review): assumed to need no overrides.
    pass
ie = TestIE(FakeYDL({
'verbose': False,
'socket_timeout': 120,
}))
# Keep a reference to the real download method, and recover its positional
# parameter names (skipping `self`) so the retry wrapper below can merge
# positional arguments into a single kwargs dict.
orig_dwh = ie._download_webpage_handle
dwh_args = orig_dwh.__code__.co_varnames[1:orig_dwh.__code__.co_argcount]
def retry_download_webpage_handle(*args, **kw):
    """Wrapper for InfoExtractor._download_webpage_handle that retries
    transient download failures.

    Positional args are mapped onto the original method's parameter names
    (via dwh_args) so everything can be passed as keywords. Performs up to
    `retries` retries after the initial attempt; on final failure it
    re-raises when the caller asked for fatal behaviour, otherwise warns
    and returns None (matching a non-fatal failed download).
    """
    kw = {
        **dict(zip(dwh_args, args)),
        **kw,
    }
    retries = 3
    fatal = kw.get('fatal', True)
    note = kw.get('note')
    # Force the wrapped call to be fatal so failures surface as
    # ExtractorError; the caller's own `fatal` is honored only at the end.
    kw['fatal'] = True
    for i in range(retries + 1):
        try:
            if i:
                kw['note'] = f'{note} (retry {i} of {retries})'
            return orig_dwh(**kw)
        except ExtractorError as e:
            # Original checked `retries == i + 1`, which gave up one attempt
            # early (only retries-1 retries ever ran and the last loop
            # iteration was unreachable). Give up only after the final retry.
            if i == retries:
                if fatal:
                    raise
                ie.report_warning(e)
                break
            ie.report_warning(f'{e} Retrying...')
ie._download_webpage_handle = retry_download_webpage_handle
def sanitize_hostname(hostname):
    """Return *hostname* without trailing (back)slashes or an explicit port.

    Slashes are removed before the port so that a value like "host:8080/"
    is reduced all the way down to "host".
    """
    without_trailing = re.sub(r'[/\\]+$', '', hostname)
    return re.sub(r':\d+$', '', without_trailing)
def traverse_sanitize(*arg, **kw):
    # Run yt-dlp's traverse_obj with the given arguments and sanitize each
    # extracted hostname. Returns a lazy map object (callers feed it straight
    # into set.update); the `or []` guards against traverse_obj yielding None.
    return map(sanitize_hostname, traverse_obj(*arg, **kw) or [])
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment