#!/usr/bin/env python3

# This file is part of the LibreOffice project.
#
# This Source Code Form is subject to the terms of the Mozilla Public
# License, v. 2.0. If a copy of the MPL was not distributed with this
# file, You can obtain one at http://mozilla.org/MPL/2.0/.

"""Download document attachments from a fixed list of office-suite forums.

Without ``--get-file`` every forum in ``forums`` is crawled in parallel:
attachment ids are probed one by one, files whose detected mimetype is listed
in ``attachment_mimetypes.mimetypes`` are stored under
``<outdir>/<suffix>/forum-<forum>-<id>.<suffix>`` and the last stored id is
written to ``<outdir>/<forum>.index`` so a later run can resume.

With ``--get-file forum-<forum>-<id>[.<suffix>]`` a single attachment is
fetched again and stored directly in ``<outdir>``.
"""

import argparse
import configparser
import hashlib
import os
import shutil
import sys
import tempfile
from concurrent.futures import ThreadPoolExecutor, as_completed

import magic
import requests
from bs4 import BeautifulSoup
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry

from attachment_mimetypes import mimetypes

forums = {
    # https://wiki.documentfoundation.org/Website/Web_Sites_services#Unofficial_and_Related_Pages
    'en': ["https://forum.openoffice.org/en/forum", False, 0],
    'es': ["https://forum.openoffice.org/es/forum", False, 0],
    'fr': ["https://forum.openoffice.org/fr/forum", False, 0],
    'hu': ["https://forum.openoffice.org/hu/forum", False, 1300],
    'it': ["https://forum.openoffice.org/it/forum", False, 0],
    'ja': ["https://forum.openoffice.org/ja/forum", False, 0],
    'nl': ["https://forum.openoffice.org/nl/forum", False, 0],
    'pl': ["https://forum.openoffice.org/pl/forum", False, 0],
    'vi': ["https://forum.openoffice.org/vi/forum", False, 0],
    'tr': ["https://forum.libreoffice.org.tr", False, 0],
    'de': ["https://www.openoffice-forum.de", False, 0],
    'de2': ["https://www.libreoffice-forum.de", False, 0],
    'de3': ["https://de.openoffice.info", False, 0],
    # Others
    'mso-de': ["https://www.ms-office-forum.net/forum", True, 0],
    'mso-en': ["https://www.msofficeforums.com", True, 0],
    'mso-en2': ["https://www.excelguru.ca/forums", False, 0],
    'mso-en3': ["http://www.vbaexpress.com/forum", True, 5100],
    'mso-en4': ["https://www.excelforum.com", True, 5100],
    # forum : [url, doLogin, startIndex]
}

# Number of consecutive HTML (i.e. "no such attachment") answers after which
# a forum is considered exhausted.
_MAX_CONSECUTIVE_INVALID = 200

# Exclusive upper bound for the attachment ids that are probed.
_MAX_ATTACHMENT_ID = 999999


def get_attachment_query(forum):
    """Return the URL path prefix that precedes an attachment id.

    Forums whose key starts with "mso" use the ``attachment.php`` endpoint,
    all others use ``download/file.php``.
    """
    if forum.startswith("mso"):
        return "/attachment.php?attachmentid="
    return "/download/file.php?id="


def createSession():
    """Return a ``requests.Session`` that retries failed connections.

    Both the http and https schemes are mounted on an adapter that retries
    connection errors up to three times with a 0.5 backoff factor.
    """
    session = requests.Session()
    retry = Retry(connect=3, backoff_factor=0.5)
    adapter = HTTPAdapter(max_retries=retry)
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session


def login(session, url, configFile):
    """Log ``session`` in to the forum at ``url``.

    The credentials are read from the ``username`` and ``password`` options
    of the ``[login]`` section of ``configFile``; the password is sent as an
    MD5 hex digest only.

    Returns True when the answer has status 200 and one of its ``<p>``
    elements contains the English or German "thank you for logging in"
    text, False otherwise.
    """
    config = configparser.ConfigParser()
    config.read(configFile)
    username = config.get('login', 'username')
    password = config.get('login', 'password')
    passwordHash = hashlib.md5(password.encode()).hexdigest()

    resp = session.post(url + '/login.php?do=login', {
        'vb_login_username': username,
        'vb_login_password': '',
        'vb_login_md5password': passwordHash,
        'vb_login_md5password_utf': passwordHash,
        'cookieuser': 1,
        'do': 'login',
        's': '',
        'securitytoken': 'guest'
    })

    if resp.status_code != 200:
        return False

    successMessages = ('Thank you for logging in', 'Danke für Ihre Anmeldung')
    soup = BeautifulSoup(resp.content, 'lxml')
    for p in soup.find_all("p"):
        text = p.get_text()
        if any(message in text for message in successMessages):
            return True
    return False


def _read_resume_index(indexFile, default):
    """Return the id to start from: one past the stored id, else ``default``.

    ``default`` is used when ``indexFile`` does not exist or does not start
    with an integer (e.g. it was left empty by an interrupted run).
    """
    if not os.path.isfile(indexFile):
        return default
    with open(indexFile) as f:
        line = f.readline().strip()
    try:
        return int(line) + 1
    except ValueError:
        print("Ignoring unreadable index file " + indexFile)
        return default


def get_attachments_from_url(forum, config, args):
    """Crawl one forum and store every attachment with a known mimetype.

    forum  -- key of the forum in ``forums``; used in file names
    config -- ``[url, doLogin, startIndex]`` entry of that forum
    args   -- parsed command line; ``args.outdir`` and ``args.config`` are read

    Attachment ids are probed in increasing order with a HEAD request. An
    answer whose content type contains "html" counts as "no attachment";
    after ``_MAX_CONSECUTIVE_INVALID`` such answers in a row the crawl stops.
    Returns None.
    """
    url = config[0]
    doLogin = config[1]
    startIndex = config[2]

    print("Checking " + url)

    # Keep the index and resume from there
    indexFile = os.path.join(args.outdir, forum + ".index")
    startIndex = _read_resume_index(indexFile, startIndex)

    session = createSession()

    if doLogin:
        if not login(session, url, args.config):
            print("Can't log in to " + url)
            return

    baseUrl = url + get_attachment_query(forum)
    invalidCount = 0
    for i in range(startIndex, _MAX_ATTACHMENT_ID):
        fileUrl = baseUrl + str(i)

        h = session.head(fileUrl)
        # The header may be absent; treat that as "not html" instead of
        # failing on a None membership test.
        content_type = h.headers.get('content-type', '')
        if "html" in content_type:
            # Let's assume this is an invalid file link
            invalidCount += 1

            # Let's assume, if we get that many invalid files in a row,
            # that there are no more files
            if invalidCount == _MAX_CONSECUTIVE_INVALID:
                print("No more attachments found in " + url)
                break
            continue

        invalidCount = 0

        r = session.get(fileUrl, allow_redirects=True)
        with tempfile.NamedTemporaryFile() as tmp:
            tmp.write(r.content)
            # The file is read back by name below, so the buffered bytes
            # must reach the disk first.
            tmp.flush()

            mimetype = magic.from_file(tmp.name, mime=True)
            if mimetype not in mimetypes:
                continue

            suffix = mimetypes[mimetype]
            suffixDir = os.path.join(args.outdir, suffix)
            # exist_ok: the directory is shared by all forum workers.
            os.makedirs(suffixDir, exist_ok=True)

            download = os.path.join(
                suffixDir, "forum-" + forum + '-' + str(i) + '.' + suffix)

            print("Downloading as " + download)
            shutil.copy(tmp.name, download)

            # Save the index
            with open(indexFile, 'w') as f:
                f.write(str(i))


def _parse_file_name(fileName):
    """Split ``forum-<forum>-<id>[.<suffix>]`` into ``(forum, id)``.

    Forum keys starting with "mso" contain a dash themselves
    (``forum-mso-en-12.doc``). Returns None when the name does not have that
    shape or names a forum that is not in ``forums``.
    """
    parts = fileName.split("-")
    if parts[0] != "forum" or len(parts) not in (3, 4):
        return None

    forum = parts[1]
    fileId = parts[2]
    if forum == "mso":
        if len(parts) != 4:
            return None
        forum += "-" + parts[2]
        fileId = parts[3]

    if forum not in forums:
        return None

    # Drop the file suffix, only the numeric id goes into the URL.
    return forum, fileId.split(".")[0]


def _get_single_file(args):
    """Fetch the attachment named by ``args.fileName`` into ``args.outdir``.

    Exits with status 1 when the name cannot be parsed or the login fails.
    """
    parsed = _parse_file_name(args.fileName)
    if parsed is None:
        print("Incorrect file name")
        sys.exit(1)
    forum, fileId = parsed

    url = forums[forum][0]
    fileUrl = url + get_attachment_query(forum) + fileId

    session = createSession()

    doLogin = forums[forum][1]
    if doLogin:
        if not login(session, url, args.config):
            print("Can't log in to " + url)
            sys.exit(1)

    r = session.get(fileUrl, allow_redirects=True)
    with tempfile.NamedTemporaryFile() as tmp:
        tmp.write(r.content)
        # Copied by name below, so flush the buffered bytes first.
        tmp.flush()

        download = os.path.join(args.outdir, args.fileName)

        print("Downloading " + fileUrl + " as " + download)
        shutil.copy(tmp.name, download)


def main():
    """Parse the command line and crawl all forums or fetch a single file."""
    parser = argparse.ArgumentParser()

    parser.add_argument('--outdir', action='store', dest="outdir", required=True)
    parser.add_argument('--config', action="store", dest="config", required=True)
    parser.add_argument('--get-file', action="store", dest="fileName", required=False)

    args = parser.parse_args()

    if not os.path.exists(args.outdir) or os.path.isfile(args.outdir):
        print("Outdir folder doesn't exists")
        sys.exit(1)
    elif not os.path.exists(args.config) or not os.path.isfile(args.config):
        print("Config file doesn't exists")
        sys.exit(1)

    if args.fileName:
        _get_single_file(args)
        return

    processes = []
    # by default, 10 at a time seems to work fine
    with ThreadPoolExecutor(max_workers=int(os.environ.get('PARALLELISM', 10))) as executor:
        for forum, config in forums.items():
            processes.append(executor.submit(get_attachments_from_url, forum, config, args))

    for task in as_completed(processes):
        result = task.result()
        if result:
            print(result)


if __name__ == '__main__':
    main()