#!/usr/bin/env python3 # -*- coding: utf-8 -*- # # This file is part of the LibreOffice project. # # This Source Code Form is subject to the terms of the Mozilla Public # License, v. 2.0. If a copy of the MPL was not distributed with this # file, You can obtain one at http://mozilla.org/MPL/2.0/. # # This digs through a pile of bugzilla's and populates the cwd with a big # collection of bug-docs in per-filetype dirs with bug-ids as names with # prefixes to indicate which bug-tracker, e.g. # # fdo-bugid-X.suffix # rhbz-bugid-X.suffix # moz-bugid-X.suffix # # where X is the n'th attachment of that type in the bug # # The results are stored in the current directory, categorized by the # extension of the downloaded file. When a file already exists, it is assumed # it is already downloaded by a previous run, and up-to-date. from __future__ import print_function import base64 import datetime import glob import os import os.path import re import stat import sys import threading try: import queue except Exception: import Queue as queue try: from urllib.request import urlopen except Exception: from urllib import urlopen try: import xmlrpc.client as xmlrpclib except Exception: import xmlrpclib from xml.dom import minidom from xml.sax.saxutils import escape from attachment_mimetypes import mimetypes import feedparser def urlopen_retry(url): """Open url, retry 3 times.""" maxretries = 3 for i in range(maxretries + 1): try: return urlopen(url) except IOError as e: print('caught IOError: ' + str(e)) if maxretries == i: raise print('retrying...') def get_from_bug_url_via_xml(url, mimetype, prefix, suffix): """Parse bug xml, download attachments with matching suffix.""" bugid = url.rsplit('=', 2)[1] print('id is ' + prefix + bugid + ' ' + suffix) print('parsing ' + bugid) sock = urlopen_retry(url+'&ctype=xml') dom = minidom.parse(sock) sock.close() attachmentid = 0 for attachment in dom.getElementsByTagName('attachment'): attachmentid += 1 print(' mimetype is', end=' ') for node in attachment.childNodes: if node.nodeName == 'type': # check if attachment is deleted if not node.firstChild: print('deleted attachment, skipping') continue print(node.firstChild.nodeValue, end=' ') if node.firstChild.nodeValue.lower() != mimetype.lower(): print('skipping') break elif node.nodeName == 'data': # check if attachment is deleted (i.e. https://bugs.kde.org/show_bug.cgi?id=53343&ctype=xml) if not node.firstChild: print('deleted attachment, skipping') continue download = (suffix + '/' + prefix + bugid + '-' + str(attachmentid) + '.' + suffix) if os.path.isfile(download): print('assuming ' + download + ' is up to date') continue # prevent re-downloading FDO attachments from TDF if prefix == 'tdf' and int(bugid) < 88776: fdodownload = download.replace('tdf', 'fdo') if os.path.isfile(fdodownload): print('assuming FDO ' + fdodownload + ' is up to date') continue print('downloading as ' + download) tmpfile = download + '.tmp' f = open(tmpfile, 'wb') f.write(base64.b64decode(node.firstChild.nodeValue)) f.close() os.rename(tmpfile, download) break def get_novell_bug_via_xml(url, mimetype, prefix, suffix): """Parse bug xml, download attachments with matching suffix.""" bugid = url.rsplit('=', 2)[1] print('id is ' + prefix + bugid + ' ' + suffix) print('parsing ' + bugid) sock = urlopen_retry(url+'&ctype=xml') dom = minidom.parse(sock) sock.close() attachmentid = 0 for comment in dom.getElementsByTagName('thetext'): commentText = comment.firstChild.nodeValue match = re.search(r'.*Created an attachment \(id=([0-9]+)\)', commentText) if not match: continue attachmentid += 1 download = (suffix + '/' + prefix + bugid + '-' + str(attachmentid) + '.' + suffix) if os.path.isfile(download): print('assuming ' + download + ' is up to date') continue realAttachmentId = match.group(1) handle = urlopen_retry(novellattach + realAttachmentId) if not handle: print('attachment ' + realAttachmentId + ' is not accessible') continue print(' mimetype is', end=' ') info = handle.info() if info.get_content_type: remoteMime = info.get_content_type() else: remoteMime = info.gettype() print(remoteMime, end=' ') if remoteMime != mimetype: print('skipping') continue print('downloading as ' + download) tmpfile = download + '.tmp' f = open(tmpfile, 'wb') f.write(handle.read()) f.close() os.rename(tmpfile, download) def create_query(mimetype): """Query all bugs with suitable mimetype attachments.""" query = {} query['query_format'] = 'advanced' query['field0-0-0'] = 'attachments.mimetype' query['type0-0-0'] = 'equals' query['value0-0-0'] = mimetype return query def get_downloaded_files(prefix, suffix): """Generate list of existing downloads (matching pre/suffix).""" return glob.glob(os.path.join(suffix, '%s*.%s' % (prefix, suffix))) def get_file_bz_ids(files, prefix): """Generate list of existing downloads (matching pre/suffix).""" return set([os.path.basename(f).split('-')[0].replace(prefix, '', 1) for f in files]) def get_changed_date(files): """Compute date of last downloaded attachment.""" newest = max([os.stat(f)[stat.ST_MTIME] for f in files]) # Subtract a day to avoid timezone differences. The worst thing that # can happen is that we are going to process more bugs than necessary. return datetime.date.fromtimestamp(newest - 24 * 60 * 60) def get_through_rpc_query(rpcurl, showurl, mimetype, prefix, suffix): """Poke Bugzilla via RPC query.""" try: os.mkdir(suffix) except Exception: pass def process(query, full, have=[]): try: proxy = xmlrpclib.ServerProxy(rpcurl) result = proxy.Bug.search(query) bugs = result['bugs'] print(str(len(bugs)) + ' bugs to process') if full: available = set([str(bug['id']) for bug in bugs]) # we already have files from all available bugs if available.difference(set(have)) == set(): print('assuming all downloaded files are up to date') return for bug in bugs: url = showurl + str(bug['id']) get_from_bug_url_via_xml(url, mimetype, prefix, suffix) except xmlrpclib.Fault as err: print('A fault occurred') print('Fault code: ' + err.faultCode) print(err.faultString) query = create_query(mimetype) query['column_list'] = 'bug_id' files = get_downloaded_files(prefix, suffix) if files != []: print('looking for updated bugs having %s attachment(s)' % mimetype) query_changed = query.copy() query_changed['field0-1-0'] = 'days_elapsed' query_changed['type0-1-0'] = 'lessthaneq' query_changed['value0-1-0'] = str((datetime.date.today() - get_changed_date(files)).days) process(query_changed, False) print('looking for all bugs having %s attachment(s)' % mimetype) process(query, True, get_file_bz_ids(files, prefix)) def get_through_rss_query(queryurl, mimetype, prefix, suffix): """Poke Bugzilla via RSS query.""" try: os.mkdir(suffix) except Exception: pass # Getting detailed bug information and downloading an attachment # body is not possible without logging in to Novell bugzilla # get_novell_bug_via_xml function is a workaround for that # situation get_bug_function = get_novell_bug_via_xml if prefix == 'novell' else get_from_bug_url_via_xml def process(query, full, have=[]): url = queryurl + '?' + '&'.join(['='.join(kv) for kv in query.items()]) print('url is ' + url) d = feedparser.parse(url) print(str(len(d['entries'])) + ' bugs to process') entries = d['entries'] if full: available = set([str(entry['id'].split('=')[-1]) for entry in entries]) # we already have files from all available bugs if available.difference(set(have)) == set(): print('assuming all downloaded files are up to date') return for entry in entries: try: get_bug_function(entry['id'], mimetype, prefix, suffix) except KeyboardInterrupt: raise # Ctrl+C should work except Exception: print(entry['id'] + ' failed: ' + str(sys.exc_info()[0])) pass query = create_query(escape(mimetype.replace('+', '%2B'))) query['ctype'] = 'rss' files = get_downloaded_files(prefix, suffix) if files != []: print('looking for updated bugs having %s attachment(s)' % mimetype) query_changed = query.copy() query_changed['field0-1-0'] = 'delta_ts' query_changed['type0-1-0'] = 'greaterthaneq' query_changed['value0-1-0'] = get_changed_date(files).isoformat() process(query_changed, False) print('looking for all bugs having %s attachment(s)' % mimetype) process(query, True, get_file_bz_ids(files, prefix)) # since searching bugs having attachments with specific mimetypes is not # available in launchpad API: # we're iterating over all bugs of the most interesting source packages launchpad_pkgs = ( 'abiword', 'calibre', 'calligra', 'gnumeric', 'inkscape', 'koffice', 'libabw', 'libcdr', 'libe-book', 'libetonyek', 'libfreehand', 'libmspub', 'libmwaw', 'liborcus', 'libpagemaker', 'libreoffice', 'libvisio', 'libwpd', 'libwpg', 'libwps', 'openoffice.org', 'python-uniconvertor', 'scribus', 'sk1', 'unoconv', ) def get_launchpad_bugs(prefix): """Query launchpad bugtracker (via launchpadlib).""" # launchpadlib python module is required to download launchpad attachments from launchpadlib.launchpad import Launchpad launchpad = Launchpad.login_anonymously('attachmentdownload', 'production') ubuntu = launchpad.distributions['ubuntu'] for pkg in launchpad_pkgs: srcpkg = ubuntu.getSourcePackage(name=pkg) pkgbugs = srcpkg.searchTasks(status=['New', 'Fix Committed', 'Invalid', "Won't Fix", 'Confirmed', 'Triaged', 'In Progress', 'Incomplete', 'Incomplete (with response)', 'Incomplete (without response)', 'Fix Released', 'Opinion', 'Expired']) for bugtask in pkgbugs: bug = bugtask.bug bugid = str(bug.id) print('parsing ' + bugid + ' status: ' + bugtask.status + ' title: ' + bug.title[:50]) attachmentid = 0 for attachment in bug.attachments: attachmentid += 1 handle = attachment.data.open() if handle.content_type not in mimetypes: #print "skipping" continue suffix = mimetypes[handle.content_type] if not os.path.isdir(suffix): try: os.mkdir(suffix) except Exception: pass download = (suffix + '/' + prefix + bugid + '-' + str(attachmentid) + '.' + suffix) if os.path.isfile(download): print('assuming ' + bugid + ' is up to date') break print('mimetype is ' + handle.content_type + ' downloading as ' + download) tmpfile = download + '.tmp' f = open(tmpfile, 'wb') f.write(handle.read()) f.close() os.rename(tmpfile, download) rss_bugzillas = ( # note: currently abisource has an expired TLS cert # ('abi', 'http://bugzilla.abisource.com/buglist.cgi'), #added for abiword ('fdo', 'http://bugs.freedesktop.org/buglist.cgi'), ('gentoo', 'http://bugs.gentoo.org/buglist.cgi'), # ('gnome', 'http://bugzilla.gnome.org/buglist.cgi' ), # added for gnumeric ('kde', 'http://bugs.kde.org/buglist.cgi'), # added for koffice/calligra ('mandriva', 'https://qa.mandriva.com/buglist.cgi'), ('moz', 'https://bugzilla.mozilla.org/buglist.cgi'), # It seems something has changed and it is no longer possible to # download any files from there. # NOTE: I am leaving it in the list, commented out, just so someone # does not add it back immediately .-) # 'novell': 'https://bugzilla.novell.com/buglist.cgi', # note: running this script against bz.apache.org apparently causes one's IP # to be banned or something; you won't get new files in any case... # ('ooo', 'https://bz.apache.org/ooo/buglist.cgi'), ('tdf', 'http://bugs.documentfoundation.org/buglist.cgi'), ) redhatrpc = 'https://bugzilla.redhat.com/xmlrpc.cgi' redhatbug = 'https://bugzilla.redhat.com/show_bug.cgi?id=' # Novell Bugzilla requires users to log in, in order to get details of # the bugs such as attachment bodies etc. As a dirty workaround, we # parse comments containing "Created an attachment (id=xxxxxx)" and # download attachments manually python-bugzilla claims that it # supports Novell bugzilla login but it's not working right now and # novell bugzilla login system is a nightmare novellattach = 'https://bugzilla.novell.com/attachment.cgi?id=' class manage_threads(threading.Thread): def run(self): while 1: # Try to receive a job from queue try: # Get job from queue # Use job parameters to call our query # Then let the queue know we are done with this job (uri, mimetype, prefix, extension) = jobs.get(True, 6) try: # set thread name for easier debugging, if process # ctl package is available import prctl prctl.set_name(prefix[:3] + ': ' + mimetype[-10:]) except ImportError: pass try: get_through_rss_query(uri, mimetype, prefix, extension) finally: jobs.task_done() except KeyboardInterrupt: raise # Ctrl+C should work except queue.Empty: break def generate_multi_threading(): # Initialize threads for _i in range(max_threads): manage_threads().start() for (prefix, uri) in rss_bugzillas: # Create a job for every mimetype for a bugzilla for (mimetype, extension) in mimetypes.items(): # It seems that bugzilla has problems returning that many results # (10000 results is probably a limit set somewhere) so we always # end processing the complete list. if mimetype == 'text/html' and prefix == 'moz': continue jobs.put([uri, mimetype, prefix, extension], block=True) print('successfully placed a job in the queue searching for ' + mimetype + ' in bugtracker ' + prefix) # Continue when all mimetypes are done for a bugzilla print('STARTED all bugtracker ' + prefix) jobs.join() # Number of threads to create, (1 = without multi-threading, default = 20) max_threads = int(os.environ.get('PARALLELISM', 20)) jobs = queue.Queue() generate_multi_threading() for (mimetype, extension) in mimetypes.items(): get_through_rpc_query(redhatrpc, redhatbug, mimetype, 'rhbz', extension) try: get_launchpad_bugs('lp') except ImportError: print('launchpadlib unavailable, skipping Ubuntu tracker') # vim:set shiftwidth=4 softtabstop=4 expandtab: