Jump to content

User:Psiĥedelisto/VisualEditor ref namer.py

From Wikipedia, the free encyclopedia

The VisualEditor (very annoyingly!) doesn't give meaningful names to references added by users; instead it gives them names like :0, :1, etc. This script fixes that automatically. It might be buggy; it has only ever been tested on osteogenesis imperfecta and furry fandom.

Requires mwparserfromhell. The input filename is the first and only argument. Outputs the completed wiki page to stdout, and some info on what changed to stderr.

#!/usr/bin/env python3

import mwparserfromhell
from mwparserfromhell.wikicode import Tag, Wikicode, Wikilink
import re
import sys

# The script takes exactly one argument: the path of the wikitext file to fix.
if len(sys.argv) != 2:
    sys.exit("usage: {} INPUT_FILENAME".format(sys.argv[0]))
_, input_filename = sys.argv

# Wikitext is UTF-8; do not depend on the platform's default encoding.
with open(input_filename, encoding="utf-8") as f:
    inp = f.read()

parsed = mwparserfromhell.parse(inp)


def get_all_links(parsed):
    """Return an iterator over every wikilink in *parsed*, at any depth."""
    return parsed.ifilter(forcetype=Wikilink, recursive=True)


def get_all_tags():
    """Return an iterator over every <ref> tag of the page, in document order.

    The pattern is anchored at the start of the tag's text and ends on a word
    boundary, so neither <references /> nor a tag that merely *contains* a
    <ref> (e.g. a <div> wrapping one) is reported as a reference.
    """
    return parsed.ifilter(forcetype=Tag, matches=r"^<\s*ref\b", recursive=True)


# Split the <ref> tags into named and unnamed ones in a single pass.
# tags_noname_idxs[j] is the position of tags_noname[j] in get_all_tags();
# recording it while enumerating keeps the two lists parallel even when
# several unnamed references have identical text (nodes compare by text).
tags = []
tags_noname = []
tags_noname_idxs = []
for idx, t in enumerate(get_all_tags()):
    if t.has("name"):
        tags.append(t)
    else:
        tags_noname.append(t)
        tags_noname_idxs.append(idx)

# References that define a VisualEditor auto-name (":0", ":1", ...); the
# self-closing reuses (<ref name=":0" />) carry no citation and are excluded.
refs = [
    t for t in tags
    if re.search(r"^:\d+$", str(t.get("name").value))
    and not re.search(r"/>$", str(t))
]

def find_date(template):
    """Return the publication year of a citation template as a string.

    The parameters ``date``, ``year`` and ``airdate`` are examined in that
    order, and the first run of four digits found in one of them is returned.
    A parameter that is present but holds no four-digit year (for instance an
    empty ``|date=``) does not prevent the later parameters from being tried.

    ``template`` must provide ``has(name)`` and ``get(name).value``.
    Returns ``None`` when no parameter yields a year.
    """
    for param in ("date", "year", "airdate"):
        if not template.has(param):
            continue
        match = re.search(r"\d{4}", str(template.get(param).value))
        if match:
            return match.group(0)
    return None

def by_work(v, template):
    """Build a reference name from a work/publisher value and the year.

    ``v`` is the wikitext of the work-like parameter. Wikilinks in it are
    replaced by their target title, all whitespace is removed, and the year
    reported by ``find_date(template)`` is appended.

    Returns ``None`` when the template has no year or nothing is left of the
    work once whitespace is removed.
    """
    date = find_date(template)
    if date is None:
        return None

    parsed_v = mwparserfromhell.parse(v)
    # Materialise the links first: the tree is modified inside the loop.
    for link in list(get_all_links(parsed_v)):
        try:
            parsed_v.replace(link, str(link.title))
        except ValueError:
            # The link was nested in one that has already been replaced.
            continue

    work = re.sub(r"\s", "", str(parsed_v))
    if not work:
        return None
    return "{}{}".format(work, date)

def by_surname(v, template):
    """Build a reference name from an author value and the year.

    ``v`` is the text of the author-like parameter. Surrounding whitespace is
    ignored (parameter values usually carry it, as in ``|last = Smith``).
    The surname is everything before the first comma if there is one,
    otherwise everything before the first space, otherwise the whole value.

    Returns ``None`` when the surname is empty or ``find_date(template)``
    finds no year.
    """
    name = v.strip()
    separator = "," if "," in name else " "
    last = name.partition(separator)[0].strip()

    if not last:
        return None

    date = find_date(template)
    if date is None:
        return None

    return "{}{}".format(last, date)

def build_refs(refs):
    """Compute a readable name for each reference that allows one.

    Only the first node of a reference's contents is examined, and only if it
    offers ``has`` (as citation templates do). The name is derived from the
    first author-like parameter present (``vauthors``, ``authors``, ``last``)
    or, if there is none, from the first work-like parameter present
    (``work``, ``website``, ``publisher``, ``series-link``, ``series``).
    References with empty contents, without a usable parameter, or whose
    derived name is missing or at most one character long are skipped.

    Returns a dict of new names. The key is the current name for a named
    reference; for an unnamed one it is ``tags_noname_idxs[i]``, where ``i``
    is the reference's position in ``refs``.
    """
    pretty = dict()

    for i, ref in enumerate(refs):
        try:
            template = ref.contents.get(0)
        except IndexError:
            # Empty tag such as <ref></ref>: nothing to derive a name from.
            continue

        if not getattr(template, "has", False):
            continue

        last_param = next(
            (p for p in ("vauthors", "authors", "last") if template.has(p)),
            None,
        )
        if last_param is not None:
            v = by_surname(str(template.get(last_param).value), template)
        else:
            work_param = next(
                (p for p in ("work", "website", "publisher", "series-link", "series")
                 if template.has(p)),
                None,
            )
            if work_param is None:
                continue
            v = by_work(str(template.get(work_param).value), template)

        if v is None or len(v.strip()) <= 1:
            continue

        if ref.has("name"):
            pretty[str(ref.get("name").value)] = v
        else:
            pretty[tags_noname_idxs[i]] = v

    return pretty

# New names, keyed by old name (named refs) or by tag position (unnamed refs).
pretty = build_refs(refs)
pretty_noname = build_refs(tags_noname)

# Apply the new names. The tags are materialised first because they are
# modified inside the loop.
for i, tag in enumerate(list(get_all_tags())):
    if tag.has("name"):
        k = str(tag.get("name").value)
        if k in pretty:
            # Update the name attribute itself: it is not necessarily the
            # first attribute (e.g. <ref group="n" name=":0">).
            tag.get("name").value = pretty[k]
    elif i in pretty_noname:
        tag.add("name", value=pretty_noname[i])

# Normalise the case of the first letter of every template name: lower case
# for the templates listed here (and any "lang-..." one), upper case otherwise.
_lowercase_templates = ("rp", "ill", "lang", "respell", "abbr")

for template in parsed.ifilter_templates():
    raw_name = str(template.name)
    tn = raw_name.strip()
    if not tn:
        continue

    lowered = tn.lower()
    if lowered in _lowercase_templates or lowered.startswith("lang-"):
        new_name = tn[0].lower() + tn[1:]
    else:
        new_name = tn[0].upper() + tn[1:]

    # Keep the whitespace that surrounded the name so that templates laid out
    # over several lines are not reflowed.
    leading = raw_name[:len(raw_name) - len(raw_name.lstrip())]
    trailing = raw_name[len(raw_name.rstrip()):]
    template.name = leading + new_name + trailing
    print(tn, "⇒", new_name, file=sys.stderr)

# The finished page is the only thing written to stdout.
print(parsed)

for k, v in pretty.items():
    print(k, "⇒", v, file=sys.stderr)
for i, v in pretty_noname.items():
    print("NONAME", i, "⇒", v, file=sys.stderr)

uniq = len(set(pretty.values()))
total = len(pretty.values())
if uniq == total:
    print("All replacements unique", file=sys.stderr)
else:
    # Diagnostics go to stderr so they never end up in the wikitext output.
    print("Some replacements not unique: {}/{}!".format(total-uniq, total), file=sys.stderr)