diff options
author | Luke Shumaker <lukeshu@parabola.nu> | 2017-11-05 10:56:30 -0500 |
---|---|---|
committer | Luke Shumaker <lukeshu@parabola.nu> | 2017-11-05 10:56:30 -0500 |
commit | eb42c9bafb26e0d214447f03ef122ae8ce605dbd (patch) | |
tree | 0ba739272c4f0c917df11066926c7ca0c3eecad9 | |
parent | 2f35bd492629a52f4849c379f02030c1efb2495c (diff) |
command: syncisos: Update HTML parser to work with repoindex [parabolaweb-2017-11-07]
First of all: the regex was broken. The existing check would have been
better written as:
- self.url_re = re.compile('(?!\.{2})/$')
...
- if value != '../' and self.url_re.search(value) is not None:
+ if value != '../' and value.endswith('/'):
Anyway, reduce the nesting a bit, and revise the check to filter out the
extra links that repoindex includes.
-rw-r--r-- | releng/management/commands/syncisos.py | 15 |
1 file changed, 9 insertions, 6 deletions
```diff
diff --git a/releng/management/commands/syncisos.py b/releng/management/commands/syncisos.py
index f182cc33..060ab2c1 100644
--- a/releng/management/commands/syncisos.py
+++ b/releng/management/commands/syncisos.py
@@ -14,14 +14,17 @@ class IsoListParser(HTMLParser):
         HTMLParser.__init__(self)
 
         self.hyperlinks = []
-        self.url_re = re.compile('(?!\.{2})/$')
 
     def handle_starttag(self, tag, attrs):
-        if tag == 'a':
-            for name, value in attrs:
-                if name == "href":
-                    if value != '../' and self.url_re.search(value) is not None:
-                        self.hyperlinks.append(value[:-1])
+        if tag != 'a':
+            return
+
+        for name, value in attrs:
+            if name != "href":
+                continue
+
+            if value.endswith('/') and value != '../' and '/' not in value[:-1] and len(value[:-1]) > 0:
+                self.hyperlinks.append(value[:-1])
 
     def parse(self, url):
         try:
```