#!/usr/bin/python
# moin2media - convert a MoinMoin wiki to MediaWiki 1.5 import format
# Copyright 2006 Free Standards Group, Inc.
# Author: Jeff Licquia
# Author: Mike McGrath
# Permission granted to release under GPL
# Altered 2008 by Ignacio Vazquez-Abrams
"""Convert a MoinMoin wiki directory into a MediaWiki XML import document.

Usage: moin2media WIKI_PATH > export.xml

Each page's newest revision is read, pushed line by line through the
filter functions listed in ``chain`` (each takes a line and returns a
``(line, metadata_dict)`` pair), table markup is rewritten, and the result
is appended to a ``<mediawiki>`` element tree.  A ``<page>(2f)Comments``
sub-page and any ``#name value`` processing instructions found in a page
are collected on the page's ``Talk:`` page.
"""

import binascii
import codecs
import os
import re
import sys
import time

try:
    import xml.etree.ElementTree as etree
except ImportError:  # Python < 2.5 only shipped the standalone package
    import elementtree.ElementTree as etree

# Legacy imports of the historical script.  Nothing below needs them any
# more; they stay (optionally) so the module namespace is unchanged on
# systems where they exist.
try:
    import cgi
except ImportError:  # 'cgi' was removed from the stdlib in Python 3.13
    cgi = None
try:
    import mx.DateTime
except ImportError:
    mx = None


# Text placed on the Talk page before the list of unconverted metadata.
# NOTE(review): wording is original; the exact line breaks were not
# recoverable and do not matter to the wikitext renderer.
_META_HEADER = (
    u"\n\nThe following metadata was found in MoinMoin that could not be"
    u" converted\nto a useful value in MediaWiki:\n\n"
)


def _table_xlat(data):
    """Rewrite MoinMoin ``||cell||cell||`` rows as a MediaWiki table.

    data -- the complete, already line-filtered page text.

    Returns the text with every run of consecutive lines starting with
    ``||`` wrapped in ``{| border="1"`` ... ``|}``; each row is followed by
    a ``|-`` row separator.
    """
    in_table = False
    result = []
    for line in data.splitlines(True):
        if line.startswith(u"||"):
            if not in_table:
                in_table = True
                result.append(u"{| border=\"1\"\n")
            # Drop one leading pipe, then the row's closing pipes/blanks.
            # The line terminator has to go first, otherwise the closing
            # "||" is never at the end of the string.
            row = line[1:].rstrip(u"\r\n").rstrip(u"| ")
            result.append(row + u"\n")
            result.append(u"|-\n")
        else:
            if in_table:
                result.append(u"|}\n")
                in_table = False
            result.append(line)
    if in_table:
        # A table that runs to the end of the page still has to be closed.
        result.append(u"|}\n")
    return u''.join(result)


def _escape(line):
    """Return the line unchanged.

    Markup characters are deliberately not escaped here; the XML
    serializer escapes element text when the document is written.
    """
    return (line, {})


def _fix_comments(line):
    """Turn a MoinMoin ``##`` comment line into an HTML comment."""
    if line.startswith(u"##"):
        line = u"<!--%s-->\n" % line[2:].rstrip(u"\r\n")
    return (line, {})


def _find_meta(line):
    """Extract a ``#name value`` processing instruction.

    Returns ``("", {name: value})`` for such a line, so the line vanishes
    from the page and the pair is reported as metadata.  Any other line,
    including a ``#word`` with no value, is passed through.
    """
    if line.startswith(u"#"):
        name, sep, value = line[1:].partition(u" ")
        if sep:
            return (u"", {name: value.rstrip(u"\r\n")})
    return (line, {})


def _studlycaps(line):
    """Placeholder for CamelCase auto-linking; currently a no-op.

    The substitution was already commented out in the historical script.
    """
    return (line, {})


def _fix_bullets(line):
    """Move an indented ``*`` bullet to the start of the line.

    NOTE(review): body reconstructed; the original text of this function
    was lost.  Without it the later ``_unspace_text`` filter would turn
    every indented bullet into a ``: `` indented paragraph.  Nesting depth
    is not preserved - confirm against the upstream script.
    """
    if re.match(r'^\s+\*', line):
        line = line.lstrip()
    return (line, {})


def _fix_numlists(line):
    """Rewrite an indented `` 1.`` list item as a MediaWiki ``#`` item.

    NOTE(review): body reconstructed; the original text of this function
    was lost - confirm against the upstream script.
    """
    if re.match(r'^\s+1\.', line):
        line = u"#" + line.lstrip()[2:]
    return (line, {})


def _fix_pre(line):
    """Translate ``{{{`` / ``}}}`` preformatted markers to HTML tags.

    A marker pair that opens and closes on one line becomes an inline
    element; otherwise the markers delimit a ``<pre>`` block.

    NOTE(review): the ``<pre>`` branch is what the damaged source shows;
    the inline condition and the ``<code>`` tag name are reconstructed -
    confirm against the upstream script.
    """
    if u"{{{" in line and u"}}}" in line:
        opening, closing = u"<code>", u"</code>"
    else:
        opening, closing = u"<pre>", u"</pre>"
    line = line.replace(u"{{{", opening).replace(u"}}}", closing)
    return (line, {})


def _unspace_text(line):
    """Replace leading blanks with the MediaWiki ``: `` indent marker."""
    if line.startswith(u" "):
        line = u": " + line.lstrip(u" ")
    return (line, {})


def _kill_link_prefixes(line):
    """Drop a ``Word:`` prefix written directly in front of ``[[``."""
    line = re.sub(r'[A-Za-z]+\:\[\[', u"[[", line)
    return (line, {})


def _fix_links(line):
    """Rewrite ``[:Page:label]`` links as ``[[Page |label]]``.

    The last colon inside the brackets separates target and label.  The
    groups may not contain ``]`` so that two links on one line are
    converted separately instead of being fused into one.
    """
    line = re.sub(r'\[\:([^\]]*)\:([^\]]*)\]', r"[[\1 |\2]]", line)
    return (line, {})


def _remove_toc(line):
    """Remove a ``[[TableOfContents]]`` macro; MediaWiki adds its own."""
    if u'TableOfContents' in line:
        line = re.sub(r'\[\[.*TableOfContents.*\]\]', u'', line)
    return (line, {})


# Filters applied to every line, in this order.  Order matters: comments
# must be handled before metadata ("##" also starts with "#"), and list
# markers must be un-indented before _unspace_text sees the line.
chain = [
    _fix_links,
    _escape,
    _fix_comments,
    _find_meta,
    _studlycaps,
    _fix_bullets,
    _fix_numlists,
    _fix_pre,
    _unspace_text,
    _kill_link_prefixes,
    _remove_toc,
]


class MoinWiki(object):
    """Read-only view of a MoinMoin data directory."""

    def __init__(self, wiki_path):
        """Remember wiki_path; raise RuntimeError if it is not a wiki.

        The directory must exist and contain ``pages/FrontPage/current``.
        """
        if not os.path.isdir(wiki_path):
            raise RuntimeError(u"%s: incorrect path to wiki" % wiki_path)
        if not os.path.exists(u"%s/pages/FrontPage/current" % wiki_path):
            raise RuntimeError(u"%s: path does not appear to be a"
                               u" MoinMoin wiki" % wiki_path)
        self.wiki_path = wiki_path

    def _check_valid_page(self, orig_page_name):
        """Raise RuntimeError unless the page has a ``current`` file."""
        if not os.path.exists(u"%s/pages/%s/current"
                              % (self.wiki_path, orig_page_name)):
            raise RuntimeError(u"page %s does not exist in"
                               u" wiki at %s"
                               % (orig_page_name, self.wiki_path))

    def _translate_page_name(self, page_name):
        """Decode ``(hex)`` escapes in an on-disk page name.

        Each parenthesised group holds the hex digits of one or more
        UTF-8 encoded bytes, e.g. ``Foo(2f)Bar`` -> ``Foo/Bar``.

        Raises RuntimeError when a group is not valid hex.  A group whose
        bytes are not valid UTF-8 is reported on stderr and left as is.
        """
        def decode_group(match):
            digits = match.group(1)
            try:
                raw = binascii.unhexlify(digits.encode("ascii"))
            except (TypeError, ValueError):
                # TypeError: Python 2 unhexlify; ValueError: Python 3
                # binascii.Error and non-ASCII "digits".
                raise RuntimeError(u"invalid escaping of %s: %s"
                                   % (page_name, digits))
            try:
                return raw.decode("utf-8")
            except UnicodeDecodeError:
                sys.stderr.write("Error2 - on page: %s\n" % page_name)
                return match.group(0)

        return re.sub(r'\((\w+)\)', decode_group, page_name)

    def _chain_translate_file(self, f):
        """Run every line of f through ``chain``, then convert tables.

        f -- any iterable of text lines.

        Returns ``(text, metadata)`` where metadata merges the dicts
        returned by all filters for all lines.
        """
        result = []
        resultmeta = {}
        for line in f:
            for chaincall in chain:
                (line, meta) = chaincall(line)
                resultmeta.update(meta)
            result.append(line)
        return (_table_xlat(u''.join(result)), resultmeta)

    def has_page(self, page_name):
        """Return True if the on-disk page name exists in the wiki."""
        try:
            self._check_valid_page(page_name)
        except RuntimeError:
            return False
        return True

    def get_orig_page_names(self):
        """Yield the on-disk (still escaped) name of every valid page."""
        for page in os.listdir(self.wiki_path + u"/pages"):
            if self.has_page(page):
                yield page

    def get_page(self, orig_page_name):
        """Load and convert the newest available revision of a page.

        Returns a dict with the keys ``name`` (decoded title),
        ``orig-name`` (on-disk name), ``text`` and ``meta``.

        Raises RuntimeError if the page does not exist or none of its
        revisions can be found.
        """
        self._check_valid_page(orig_page_name)
        results = {u"name": self._translate_page_name(orig_page_name),
                   u"orig-name": orig_page_name}
        page_path = u"%s/pages/%s" % (self.wiki_path, orig_page_name)

        with codecs.open(u"%s/current" % page_path, 'r', 'utf-8') as fh:
            revnum = fh.read().strip()

        # 'current' may name a revision that is not on disk; walk back to
        # the newest one that is, keeping the zero-padded width.
        while not os.path.exists(u"%s/revisions/%s" % (page_path, revnum)):
            try:
                previous = int(revnum) - 1
            except ValueError:
                raise RuntimeError(u"page %s: invalid revision number %r"
                                   % (orig_page_name, revnum))
            if previous < 0:
                raise RuntimeError(u"page %s has no revisions"
                                   % orig_page_name)
            revnum = u'%0*d' % (len(revnum), previous)

        with codecs.open(u"%s/revisions/%s" % (page_path, revnum),
                         'r', 'utf-8') as text_file:
            (results[u"text"], results[u"meta"]) = \
                self._chain_translate_file(text_file)
        return results

    def get_pages(self):
        """Yield the converted page dict for every page in the wiki."""
        for page in self.get_orig_page_names():
            yield self.get_page(page)


class MWExport(object):
    """Builds a MediaWiki import document from a MoinWiki source."""

    def __init__(self, source):
        """Create an empty ``<mediawiki>`` document for wiki ``source``."""
        self.source_wiki = source
        self.etroot = etree.Element(u"mediawiki")
        self.etroot.set(u"xml:lang", u"en")
        self.etdoc = etree.ElementTree(self.etroot)
        # One UTC timestamp (YYYY-MM-DDTHH:MM:SSZ) shared by all revisions.
        self.timestr = time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime())

    def _create_blank_page(self):
        """Return a ``<page>`` skeleton with empty title and text."""
        mwpage = etree.Element(u"page")
        etree.SubElement(mwpage, u"title")
        mwrevision = etree.SubElement(mwpage, u"revision")
        etree.SubElement(mwrevision, u"timestamp").text = self.timestr
        mwcontrib = etree.SubElement(mwrevision, u"contributor")
        etree.SubElement(mwcontrib, u"username").text = u"ImportUser"
        etree.SubElement(mwrevision, u"comment").text = \
            u"Imported from MoinMoin"
        etree.SubElement(mwrevision, u"text")
        return mwpage

    def _append_page(self, title, text):
        """Append a filled-in ``<page>`` element to the document."""
        mwpage = self._create_blank_page()
        mwpage.find(u"title").text = title
        mwpage.find(u"revision/text").text = text
        self.etroot.append(mwpage)

    def add_page(self, page):
        """Add a converted page and, if needed, its Talk page.

        page -- a dict as returned by ``MoinWiki.get_page``.

        The Talk page receives the text of the ``<page>(2f)Comments``
        sub-page, if any, followed by a list of the page's metadata.
        """
        self._append_page(page[u"name"], page[u"text"])

        # The comments sub-page has to be looked up by its on-disk name;
        # the decoded title does not name a directory.
        orig_name = page.get(u"orig-name", page[u"name"])
        comments_name = orig_name + u"(2f)Comments"

        talk_page_content = []
        if self.source_wiki.has_page(comments_name):
            comment_page = self.source_wiki.get_page(comments_name)
            talk_page_content.append(comment_page[u"text"])
        if page[u"meta"]:
            talk_page_content.append(_META_HEADER)
            # Sorted so that repeated exports produce identical output.
            for key, value in sorted(page[u"meta"].items()):
                talk_page_content.append(u"* %s: %s\n" % (key, value))

        if talk_page_content:
            self._append_page(u"Talk:%s" % page[u"name"],
                              u''.join(talk_page_content))

    def add_pages(self):
        """Add every page of the source wiki.

        ``(2f)Comments`` sub-pages are skipped here; add_page merges them
        into the Talk page of their parent.
        """
        for page in self.source_wiki.get_pages():
            if not page[u"orig-name"].endswith(u"(2f)Comments"):
                self.add_page(page)

    def write(self, f):
        """Serialize the document to the file object f."""
        self.etdoc.write(f)


def main():
    """Export the wiki named on the command line to standard output.

    Returns a process exit status (2 on a usage error, otherwise None).
    """
    if len(sys.argv) != 2:
        sys.stderr.write("usage: %s WIKI_PATH\n" % sys.argv[0])
        return 2
    export = MWExport(MoinWiki(sys.argv[1]))
    export.add_pages()
    # The serializer emits ASCII bytes (non-ASCII text becomes character
    # references), so write to the byte stream under Python 3's stdout.
    out = getattr(sys.stdout, "buffer", sys.stdout)
    # NOTE(review): the declaration text was lost from the original
    # literal (only the trailing newline survived); reconstructed.
    out.write(b'<?xml version="1.0"?>\n')
    export.write(out)
    return None


if __name__ == "__main__":
    sys.exit(main())