diff options
Diffstat (limited to 'libre/iceweasel/process-json-files.py')
-rw-r--r-- | libre/iceweasel/process-json-files.py | 252 |
1 files changed, 252 insertions, 0 deletions
#! /usr/bin/python3

# Copyright (C) 2020, 2021, 2022, 2023 grizzlyuser <grizzlyuser@protonmail.com>
# Based on: https://gitlab.trisquel.org/trisquel/wrapage-helpers/-/blob/81881d89b2bf7d502dd14fcccdb471fec6f6b206/helpers/DATA/firefox/reprocess-search-config.py
# Below is the notice from the original author:
#
# Copyright (C) 2020 Ruben Rodriguez <ruben@trisquel.info>
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

"""Rewrite the Remote Settings JSON dumps shipped with the browser source.

For each processor class below, the driver at the bottom of this file loads
one or more JSON inputs, transforms them (dropping unwanted records, scrubbing
fields, refreshing timestamps), optionally validates every record against a
JSON schema, and writes the result back in place.
"""

import json
import sys
import time
import copy
import argparse
import pathlib
from collections import namedtuple
from jsonschema import validate

parser = argparse.ArgumentParser()
parser.add_argument(
    'MAIN_PATH',
    type=pathlib.Path,
    help='path to main application source code directory')
parser.add_argument(
    'BRANDING_PATH',
    type=pathlib.Path,
    help='path to branding source code directory')
parser.add_argument(
    '-i',
    '--indent',
    type=int,
    default=2,
    help='indent for pretty printing of output files')
arguments = parser.parse_args()

# A parsed JSON document together with the path it was read from / will be
# written to.
File = namedtuple('File', ['path', 'content'])


class RemoteSettings:
    """Base processor for Remote Settings collection dumps.

    Subclasses declare JSON_PATHS / OUTPUT_PATH and implement
    should_drop_record() and process_record(); this class supplies the
    shared unwrap -> transform -> validate -> wrap pipeline.
    """

    DUMPS_PATH_RELATIVE = 'services/settings/dumps'
    DUMPS_PATH_ABSOLUTE = arguments.MAIN_PATH / DUMPS_PATH_RELATIVE

    # Collection dumps store their records under this top-level key.
    _WRAPPER_NAME = 'data'
    _LAST_MODIFIED_KEY_NAME = 'last_modified'

    @classmethod
    def get_collection_timestamp(cls, collection):
        """Return the newest last_modified value in *collection* (0 if empty)."""
        return max((record[cls._LAST_MODIFIED_KEY_NAME]
                    for record in collection.content), default=0)

    @classmethod
    def wrap(cls, processed):
        """Re-wrap a processed record list into the on-disk dump format."""
        return File(processed.path,
                    {cls._WRAPPER_NAME: processed.content,
                     'timestamp': cls.get_collection_timestamp(processed)})

    @classmethod
    def unwrap(cls, parsed_jsons):
        """Extract the bare record lists from a sequence of parsed dumps."""
        return [File(json.path, json.content[cls._WRAPPER_NAME])
                for json in parsed_jsons]

    @classmethod
    def should_modify_collection(cls, collection):
        """Whether records of *collection* may be dropped/rewritten.

        Subclasses that merge several inputs override this to protect some
        of them (see TopSites).
        """
        return True

    @classmethod
    def now(cls):
        # Current UNIX time in milliseconds. Integer floor division avoids
        # the float round-trip of the previous int(round(ns / 10 ** 6)).
        return time.time_ns() // 10 ** 6

    @classmethod
    def process_raw(cls, unwrapped_jsons, parsed_schema):
        """Filter and rewrite records from all inputs into one sorted list.

        Records whose content actually changed get a fresh, unique
        last_modified timestamp; every surviving record is validated against
        *parsed_schema* when one is given.
        """
        timestamps, result = [], []

        for collection in unwrapped_jsons:
            should_modify_collection = cls.should_modify_collection(collection)
            for record in collection.content:
                if should_modify_collection:
                    if cls.should_drop_record(record):
                        continue

                    clone = copy.deepcopy(record)

                    record = cls.process_record(record)

                    if clone != record:
                        # Timestamps double as record revision IDs, so make
                        # sure no two modified records share one.
                        timestamp = cls.now()
                        while timestamp in timestamps:
                            timestamp += 1
                        timestamps.append(timestamp)
                        record[cls._LAST_MODIFIED_KEY_NAME] = timestamp

                if parsed_schema is not None:
                    validate(record, schema=parsed_schema)

                result.append(record)

        # Newest records first, matching Remote Settings dump conventions.
        result.sort(
            key=lambda record: record[cls._LAST_MODIFIED_KEY_NAME], reverse=True)
        cls.OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

        return File(cls.OUTPUT_PATH, result)

    @classmethod
    def process(cls, parsed_jsons, parsed_schema):
        """Full pipeline: unwrap inputs, transform records, wrap the result."""
        return cls.wrap(
            cls.process_raw(
                cls.unwrap(parsed_jsons),
                parsed_schema))


class Changes(RemoteSettings):
    """Regenerate monitor/changes, the index of all collection dumps."""

    JSON_PATHS = tuple(RemoteSettings.DUMPS_PATH_ABSOLUTE.glob('*/*.json'))
    OUTPUT_PATH = RemoteSettings.DUMPS_PATH_ABSOLUTE / 'monitor/changes'

    @classmethod
    def wrap(cls, processed):
        # The changes endpoint uses 'changes' instead of 'data' and stamps
        # the generation time rather than the newest record.
        return File(
            processed.path, {
                'changes': processed.content, 'timestamp': cls.now()})

    @classmethod
    def process_raw(cls, unwrapped_jsons, parsed_schema):
        """Build one {last_modified, bucket, collection} entry per dump."""
        changes = []

        for collection in unwrapped_jsons:
            # These two dumps are intentionally absent from the index.
            if collection.path not in (RemoteSettings.DUMPS_PATH_ABSOLUTE / 'main/example.json', RemoteSettings.DUMPS_PATH_ABSOLUTE / 'main/search-config-v2.json'):
                latest_change = {}
                latest_change[cls._LAST_MODIFIED_KEY_NAME] = cls.get_collection_timestamp(
                    collection)
                latest_change['bucket'] = collection.path.parent.name
                latest_change['collection'] = collection.path.stem
                changes.append(latest_change)

        cls.OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)

        return File(cls.OUTPUT_PATH, changes)


class SearchConfig(RemoteSettings):
    """Strip the search engine list down to DuckDuckGo and Wikipedia."""

    JSON_PATHS = (
        RemoteSettings.DUMPS_PATH_ABSOLUTE /
        'main/search-config.json',
    )
    SCHEMA_PATH = arguments.MAIN_PATH / \
        'toolkit/components/search/schema/search-config-schema.json'
    OUTPUT_PATH = JSON_PATHS[0]

    _DUCKDUCKGO_SEARCH_ENGINE_ID = 'ddg@search.mozilla.org'

    @classmethod
    def should_drop_record(cls, search_engine):
        """Drop every engine except DuckDuckGo and Wikipedia."""
        return search_engine['webExtension']['id'] not in (
            cls._DUCKDUCKGO_SEARCH_ENGINE_ID, 'wikipedia@search.mozilla.org')

    @classmethod
    def process_record(cls, search_engine):
        """Scrub telemetry fields and force the engine to apply everywhere."""
        # Plain loop instead of a side-effect-only list comprehension.
        for key in ('extraParams', 'telemetryId'):
            search_engine.pop(key, None)

        general_specifier = {}
        for specifier in search_engine['appliesTo'].copy():
            # Application-specific specifiers are not wanted at all.
            if 'application' in specifier:
                search_engine['appliesTo'].remove(specifier)
                continue

            if 'included' in specifier and 'everywhere' in specifier[
                    'included'] and specifier['included']['everywhere']:
                general_specifier = specifier

        if not general_specifier:
            general_specifier = {'included': {'everywhere': True}}
            search_engine['appliesTo'].insert(0, general_specifier)

        if search_engine['webExtension']['id'] == cls._DUCKDUCKGO_SEARCH_ENGINE_ID:
            general_specifier['default'] = 'yes'
            search_engine.pop('params')

        return search_engine


class TippyTopSites:
    """Merge branding tippy-top sites with Wikipedia entries from the main list."""

    JSON_PATHS = (
        arguments.MAIN_PATH /
        'browser/components/newtab/data/content/tippytop/top_sites.json',
        arguments.BRANDING_PATH /
        'tippytop/top_sites.json')

    @classmethod
    def process(cls, parsed_jsons, parsed_schema):
        # Branding entries first, then any main entry covering wikipedia.org;
        # the result replaces the main file.
        tippy_top_sites_main = parsed_jsons[0]
        tippy_top_sites_branding = parsed_jsons[1]
        result = tippy_top_sites_branding.content + \
            [site for site in tippy_top_sites_main.content if 'wikipedia.org' in site['domains']]
        return File(tippy_top_sites_main.path, result)


class TopSites(RemoteSettings):
    """Combine branding top sites with Wikipedia from the main dump."""

    _TOP_SITES_JSON_PATH = 'main/top-sites.json'
    _TOP_SITES_PATH_MAIN = RemoteSettings.DUMPS_PATH_ABSOLUTE / _TOP_SITES_JSON_PATH

    JSON_PATHS = (
        arguments.BRANDING_PATH /
        RemoteSettings.DUMPS_PATH_RELATIVE /
        _TOP_SITES_JSON_PATH,
        _TOP_SITES_PATH_MAIN)
    OUTPUT_PATH = _TOP_SITES_PATH_MAIN

    @classmethod
    def should_modify_collection(cls, collection):
        # Only filter the main dump; branding records pass through untouched.
        return cls._TOP_SITES_PATH_MAIN == collection.path

    @classmethod
    def should_drop_record(cls, site):
        return site['url'] != 'https://www.wikipedia.org/'

    @classmethod
    def process_record(cls, site):
        site.pop('exclude_regions', None)
        return site


# To reflect the latest timestamps, Changes class should always come after
# all other RemoteSettings subclasses
processors = (TippyTopSites, SearchConfig, TopSites, Changes)

for processor in processors:
    parsed_jsons = []
    for json_path in processor.JSON_PATHS:
        # JSON is UTF-8 by specification; don't depend on the locale default.
        with json_path.open(encoding='utf-8') as file:
            parsed_jsons.append(File(json_path, json.load(file)))

    parsed_schema = None
    if hasattr(processor, "SCHEMA_PATH"):
        with processor.SCHEMA_PATH.open(encoding='utf-8') as file:
            parsed_schema = json.load(file)

    processed = processor.process(parsed_jsons, parsed_schema)
    with processed.path.open('w', encoding='utf-8') as file:
        json.dump(processed.content, file, indent=arguments.indent)