| author | rubenwardy <rw@rubenwardy.com> | 2018-07-04 00:14:37 +0100 |
| --- | --- | --- |
| committer | rubenwardy <rw@rubenwardy.com> | 2018-07-04 00:38:51 +0100 |
| commit | 19e1ed8b32179e3317c807b3ab0581e3b5fb00a2 (patch) | |
| tree | 3e3409482aa7a0f8c8da8721df26350f64a00f6a /app/tasks/phpbbparser.py | |
| parent | eb6b1d6375841cd7b87b4f6a08b34ed5239a6354 (diff) | |
| download | cheatdb-19e1ed8b32179e3317c807b3ab0581e3b5fb00a2.tar.xz | |
Implement forum parser to increase accuracy
Diffstat (limited to 'app/tasks/phpbbparser.py')
-rw-r--r-- | app/tasks/phpbbparser.py | 70
1 files changed, 70 insertions, 0 deletions
diff --git a/app/tasks/phpbbparser.py b/app/tasks/phpbbparser.py
index d27ccec..9984ad0 100644
--- a/app/tasks/phpbbparser.py
+++ b/app/tasks/phpbbparser.py
@@ -5,6 +5,7 @@ import urllib, socket
 from bs4 import *
 from urllib.parse import urljoin
+from datetime import datetime
 import urllib.request
 import os.path
 import time, re
@@ -77,3 +78,72 @@ def getProfile(url, username):
 	__extract_properties(profile, soup)
 
 	return profile
+
+
+regex_id = re.compile(r"^.*t=([0-9]+).*$")
+
+def parseForumListPage(id, page, out, extra=None):
+	num_per_page = 30
+	start = page*num_per_page+1
+	print(" - Fetching page {} (topics {}-{})".format(page, start, start+num_per_page))
+
+	url = "https://forum.minetest.net/viewforum.php?f=" + str(id) + "&start=" + str(start)
+	r = urllib.request.urlopen(url).read().decode("utf-8")
+	soup = BeautifulSoup(r, "html.parser")
+
+	for row in soup.find_all("li", class_="row"):
+		classes = row.get("class")
+		if "sticky" in classes or "announce" in classes or "global-announce" in classes:
+			continue
+
+		topic = row.find("dl")
+
+		# Link info
+		link = topic.find(class_="topictitle")
+		id = regex_id.match(link.get("href")).group(1)
+		title = link.find(text=True)
+
+		# Date
+		left = topic.find("dt")
+		date = left.get_text().split("»")[1].strip()
+		date = datetime.strptime(date, "%a %b %d, %Y %H:%M")
+		author = left.find_all("a")[-1].get_text().strip()
+
+		# Get counts
+		posts = topic.find(class_="posts").find(text=True)
+		views = topic.find(class_="views").find(text=True)
+
+		if id in out:
+			print(" - got {} again, title: {}".format(id, title))
+			assert(title == out[id]['title'])
+			return False
+
+		row = {
+			"id" : id,
+			"title" : title,
+			"author": author,
+			"posts" : posts,
+			"views" : views,
+			"date" : date
+		}
+
+		if extra is not None:
+			for key, value in extra.items():
+				row[key] = value
+
+		out[id] = row
+
+	return True
+
+def getTopicsFromForum(id, out={}, extra=None):
+	print("Fetching all topics from forum {}".format(id))
+	page = 0
+	while parseForumListPage(id, page, out, extra):
+		page = page + 1
+
+	return out
+
+def dumpTitlesToFile(topics, path):
+	with open(path, "w") as out_file:
+		for topic in topics.values():
+			out_file.write(topic["title"] + "\n")
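For orientation, here is a minimal sketch of how the helpers added in this patch might be driven once it is applied. It assumes only the functions introduced above (`getTopicsFromForum` and `dumpTitlesToFile`) imported from `app.tasks.phpbbparser`; the forum id `11` and the output filename are illustrative placeholders, not values taken from this commit.

```python
# Hypothetical driver for the helpers added in this patch; the forum id and
# output path below are placeholders, not values used anywhere in the commit.
from app.tasks.phpbbparser import getTopicsFromForum, dumpTitlesToFile

def crawl_forum(forum_id):
	# Passing out={} explicitly sidesteps the mutable default argument on
	# getTopicsFromForum, so repeated calls never share state.
	# Any key/value pairs given via `extra` are copied into every topic row.
	topics = getTopicsFromForum(forum_id, out={}, extra={"source_forum": forum_id})

	# topics maps topic id -> {"id", "title", "author", "posts", "views", "date"};
	# dumpTitlesToFile writes one title per line for quick inspection.
	dumpTitlesToFile(topics, "forum_{}_titles.txt".format(forum_id))
	return topics

if __name__ == "__main__":
	result = crawl_forum(11)  # placeholder forum id
	print("Collected {} topics".format(len(result)))
```

Note that `parseForumListPage` returns False as soon as it sees a topic id it has already recorded, which ends the paging loop in `getTopicsFromForum`; passing a pre-populated `out` dict therefore acts as a resume point rather than triggering a full re-crawl.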