summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Luke Shumaker <lukeshu@parabola.nu> 2017-11-05 10:56:30 -0500
committer: Luke Shumaker <lukeshu@parabola.nu> 2019-03-10 01:08:41 -0500
commit: 7e4e5a1a1855016f527dd0fe7c748b87e8e6e4a8 (patch)
tree: 6f6fa6bf214cf749ddbbc9abcc84409f91ade55e
parent: 7b3c30691f8ff901b8ddb32b17d6894015b68ef7 (diff)
command: syncisos: Update HTML parser to work with repoindex
First of all: the regex was broken. The existing check would have been better written as:

    - self.url_re = re.compile('(?!\.{2})/$')
    ...
    - if value != '../' and self.url_re.search(value) is not None:
    + if value != '../' and value.endswith('/'):

Anyway, reduce the nesting a bit, and revise the check to filter out the extra links that repoindex includes.
-rw-r--r-- releng/management/commands/syncisos.py 15
1 files changed, 9 insertions, 6 deletions
diff --git a/releng/management/commands/syncisos.py b/releng/management/commands/syncisos.py
index f182cc33..060ab2c1 100644
--- a/releng/management/commands/syncisos.py
+++ b/releng/management/commands/syncisos.py
@@ -14,14 +14,17 @@ class IsoListParser(HTMLParser):
HTMLParser.__init__(self)
self.hyperlinks = []
- self.url_re = re.compile('(?!\.{2})/$')
def handle_starttag(self, tag, attrs):
- if tag == 'a':
- for name, value in attrs:
- if name == "href":
- if value != '../' and self.url_re.search(value) is not None:
- self.hyperlinks.append(value[:-1])
+ if tag != 'a':
+ return
+
+ for name, value in attrs:
+ if name != "href":
+ continue
+
+ if value.endswith('/') and value != '../' and '/' not in value[:-1] and len(value[:-1]) > 0:
+ self.hyperlinks.append(value[:-1])
def parse(self, url):
try: