diff options
author | Luke Shumaker <lukeshu@parabola.nu> | 2017-11-05 10:56:30 -0500 |
---|---|---|
committer | Luke Shumaker <lukeshu@parabola.nu> | 2019-03-10 01:08:41 -0500 |
commit | 7e4e5a1a1855016f527dd0fe7c748b87e8e6e4a8 (patch) | |
tree | 6f6fa6bf214cf749ddbbc9abcc84409f91ade55e | |
parent | 7b3c30691f8ff901b8ddb32b17d6894015b68ef7 (diff) |
command: syncisos: Update HTML parser to work with repoindex
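First of all: the regex was broken. The negative lookahead (?!\.{2}) is
tested at the position of the final '/', where the following characters can
never be '..', so it always succeeds and the pattern is equivalent to a
plain '/$'. The existing check would have been better written as: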
- self.url_re = re.compile('(?!\.{2})/$')
...
- if value != '../' and self.url_re.search(value) is not None:
+ if value != '../' and value.endswith('/'):
Anyway, reduce the nesting a bit, and revise the check to filter out the
extra links that repoindex includes.
-rw-r--r-- | releng/management/commands/syncisos.py | 15 |
1 files changed, 9 insertions, 6 deletions
```diff
diff --git a/releng/management/commands/syncisos.py b/releng/management/commands/syncisos.py
index f182cc33..060ab2c1 100644
--- a/releng/management/commands/syncisos.py
+++ b/releng/management/commands/syncisos.py
@@ -14,14 +14,17 @@ class IsoListParser(HTMLParser):
         HTMLParser.__init__(self)
 
         self.hyperlinks = []
-        self.url_re = re.compile('(?!\.{2})/$')
 
     def handle_starttag(self, tag, attrs):
-        if tag == 'a':
-            for name, value in attrs:
-                if name == "href":
-                    if value != '../' and self.url_re.search(value) is not None:
-                        self.hyperlinks.append(value[:-1])
+        if tag != 'a':
+            return
+
+        for name, value in attrs:
+            if name != "href":
+                continue
+
+            if value.endswith('/') and value != '../' and '/' not in value[:-1] and len(value[:-1]) > 0:
+                self.hyperlinks.append(value[:-1])
 
     def parse(self, url):
         try:
```