Пређи на садржај

Корисник:Rainman/Bizma.py

С Википедије, слободне енциклопедије

Korišćenje programa:

import bizma
bizma.plainText('srwiki-latest-stub-meta-history.xml')   # za lepo formatiran plaintext
bizma.wikiText('srwiki-latest-stub-meta-history.xml')   # za wiki tabelu samo sa ukupnim brojem promena
bizma.wikiTextByNs('srwiki-latest-stub-meta-history.xml') # za wiki tabele za svaki imenski prostor
bizma.wikiTextAllTable('srwiki-latest-stub-meta-history.xml') # za veliku wiki tabelu sa svim informacijama

Program koristi dump-ove sa adrese http://download.wikimedia.org/srwiki i to fajl srwiki-latest-stub-meta-history.xml (fajl se može proslediti i neotpakovan, tj. u .gz formatu, međutim tada izvršavanje traje nešto duže). Program proizvodi izlazni fajl sa vremenskim žigom, i ukoliko imate dovoljno slobodne memorije trebalo bi da završi za nekih 5-6 sekundi.

import re
import operator
import time
import gzip

def getContrib(filename):
    ''' Count edits per user in a stub-meta-history dump.

        Every line that contains a <username>...</username> element counts
        as one edit by that user; only the first such element on a line is
        considered.

        filename -- path to *-stub-meta-history.xml; a name ending in '.gz'
                    is read through gzip.

        Returns a list of (user, editcount) tuples sorted by editcount,
        highest first. Users with equal counts keep the order in which they
        first appear in the dump (the sort is stable).
    '''
    # Open in text mode with an explicit encoding: gzip.open() defaults to
    # binary mode, which would feed bytes to the str pattern below, and the
    # built-in open() would otherwise fall back to the locale encoding.
    # The dump is expected to be UTF-8.
    opener = gzip.open if filename.endswith('.gz') else open
    username_re = re.compile('<username>(.*?)</username>')
    contribs = {}

    # 'with' guarantees the file is closed even if reading/decoding fails.
    with opener(filename, 'rt', encoding='utf-8') as f:
        for line in f:
            match = username_re.search(line)
            if match:
                user = match.group(1)
                contribs[user] = contribs.get(user, 0) + 1

    return sorted(contribs.items(), key=operator.itemgetter(1), reverse=True)

def filterNamespace(ns, item):
    ''' Project one user's per-namespace counts onto a single namespace.

        item is a pair (username, {ns1: editcount, ns2: editcount, ...}).
        Returns (username, editcount for ns); the count is 0 when the
        user has no entry for ns.
    '''
    username = item[0]
    per_namespace = item[1]
    count = per_namespace[ns] if ns in per_namespace else 0
    return (username, count)

def getContribByNamespace(filename):
    ''' Count edits per user and per namespace in a stub-meta-history dump.

        filename -- path to *-stub-meta-history.xml; a name ending in '.gz'
                    is read through gzip.

        Returns the triplet (contribs, namespaces, nsContrib):
        * contribs   - dictionary {username: {ns1: editcount, ns2: editcount}}
        * namespaces - list of namespace names, starting with 'Glavni' (the
                       label used here for pages without a namespace prefix)
                       followed by the names read from the <siteinfo> header
        * nsContrib  - list [(ns1, sortedUserList), (ns2, sortedUserList), ...]
                       where sortedUserList is a list of (user, editcount)
                       sorted by editcount, highest first; users without
                       edits in that namespace appear with editcount 0
    '''
    # Text mode with an explicit encoding: gzip.open() defaults to binary
    # mode, which would feed bytes to the str patterns below. The dump is
    # expected to be UTF-8.
    opener = gzip.open if filename.endswith('.gz') else open

    namespace_re = re.compile('<namespace key.*?>(.*?)</namespace>')
    username_re = re.compile('<username>(.*?)</username>')
    title_re = re.compile('<title>(.*?)</title>')

    namespaces = ['Glavni']
    contribs = {}

    with opener(filename, 'rt', encoding='utf-8') as f:
        # Pass over the header: collect namespace names up to </siteinfo>.
        for line in f:
            found = namespace_re.search(line)
            if found:
                namespaces.append(found.group(1))

            if '</siteinfo>' in line:
                break

        known = set(namespaces)
        # Start from a valid namespace so that a username line seen before
        # any title line cannot pick up a stale value.
        ns = 'Glavni'

        # Continue on the same file object: each <title> line sets the
        # namespace that the following <username> lines are booked under.
        for line in f:
            title = title_re.search(line)
            if title:
                prefix, colon, _rest = title.group(1).partition(':')
                # A colon alone does not make a namespace ("Film: Nastavak");
                # only prefixes declared in the header count, everything else
                # belongs to the main namespace.
                if colon and prefix in known:
                    ns = prefix
                else:
                    ns = 'Glavni'

            user = username_re.search(line)
            if user:
                per_ns = contribs.setdefault(user.group(1), {})
                per_ns[ns] = per_ns.get(ns, 0) + 1

    # Build, for every namespace, the user list sorted by edit count.
    nsContrib = []
    for name in namespaces:
        counts = [(user, per_ns.get(name, 0)) for user, per_ns in contribs.items()]
        counts.sort(key=operator.itemgetter(1), reverse=True)
        nsContrib.append((name, counts))

    return contribs, namespaces, nsContrib

def plainText(filename):
    ''' Write the sorted (user, editcount) list as an aligned plaintext table.

        filename -- dump file passed on to getContrib.

        The table goes to a new file in the current directory named
        'sortedContribList-<timestamp>'; the file name is printed when done.
    '''
    sContrib = getContrib(filename)

    name = 'sortedContribList-%s' % time.strftime('%Y-%m-%d@%H:%M:%S',time.localtime())

    # Explicit UTF-8 so non-ASCII usernames can be written regardless of the
    # locale; 'with' closes the file even if a write fails.
    with open(name, 'w', encoding='utf-8') as out:
        out.write('Rank   %-30s -> %8s\n' % ('Username','edits'))
        out.write('----------------------------------------------------\n')
        # Usernames are text strings, so %-30s already pads per character;
        # no extra padding for multi-byte characters is needed.
        for rank, contr in enumerate(sContrib, 1):
            out.write('%-6d %-30s -> %8d\n' % (rank, contr[0], contr[1]))

    print('\nDONE: Wrote contrib table to file %s' % name)

def wikiOutput(out,contrib):
    ''' Emit a sorted contrib list as a wiki table.

        out     -- object with a write() method that receives the markup
        contrib -- iterable of (user, editcount) entries, already sorted

        Entries whose editcount is 0 are left out and do not consume a rank.
    '''
    out.write('{|\n| Rang || Korisnik || Broj izmena\n')

    # Filter lazily so rows are written in the same order they are consumed.
    active = (entry for entry in contrib if entry[1] != 0)
    for position, entry in enumerate(active, 1):
        user = entry[0]
        edits = entry[1]
        out.write('|-\n|%d. || [[Korisnik:%s|%s]] || %d\n' % (position, user, user, edits))

    out.write('|}\n')


def wikiText(filename):
    ''' Write the sorted (user, editcount) list as a single wiki table.

        filename -- dump file passed on to getContrib.

        The table goes to a new file in the current directory named
        'sortedContribList-<timestamp>'; the file name is printed when done.
    '''
    sContrib = getContrib(filename)

    name = 'sortedContribList-%s' % time.strftime('%Y-%m-%d@%H:%M:%S',time.localtime())

    # Explicit UTF-8 so non-ASCII usernames can be written regardless of the
    # locale; 'with' closes the file even if writing fails.
    with open(name, 'w', encoding='utf-8') as out:
        wikiOutput(out, sContrib)

    print('\nDONE: Wrote contrib table to file %s' % name)


def wikiTextByNs(filename):
    ''' Write one wiki table of contributors per namespace.

        filename -- dump file passed on to getContribByNamespace.

        Each namespace gets a '=== name ===' heading followed by its table.
        Output goes to a new file in the current directory named
        'sortedContribList-<timestamp>'; the file name is printed when done.
    '''
    # Only the per-namespace sorted lists are needed here.
    _contribs, _namespaces, nsContribs = getContribByNamespace(filename)

    name = 'sortedContribList-%s' % time.strftime('%Y-%m-%d@%H:%M:%S',time.localtime())

    # Explicit UTF-8 so non-ASCII names can be written regardless of the
    # locale; 'with' closes the file even if writing fails.
    with open(name, 'w', encoding='utf-8') as out:
        for ns_name, sorted_users in nsContribs:
            out.write('=== %s ===\n' % ns_name)
            wikiOutput(out, sorted_users)

    print('\nDONE: Wrote contrib table to file %s' % name)


def wikiTextAllTable(filename):
    ''' Write one big wiki table: per user, edits in every namespace plus
        the total number of edits.

        filename -- dump file passed on to getContribByNamespace.

        Rows are ordered by total edit count, highest first. A namespace in
        which the user has no edits is left as an empty cell. Output goes to
        a new file in the current directory named
        'sortedContribList-<timestamp>'; the file name is printed when done.
    '''
    contribs, namespaces, _nsContribs = getContribByNamespace(filename)

    # A user's total is the sum of the per-namespace counts, so the dump does
    # not have to be parsed a second time just to get the sorted totals.
    totals = sorted(
        ((user, sum(per_ns.values())) for user, per_ns in contribs.items()),
        key=operator.itemgetter(1), reverse=True)

    name = 'sortedContribList-%s' % time.strftime('%Y-%m-%d@%H:%M:%S',time.localtime())

    # Explicit UTF-8 so non-ASCII names can be written regardless of the
    # locale; 'with' closes the file even if writing fails.
    with open(name, 'w', encoding='utf-8') as out:
        out.write('{|\n| Rang || Korisnik || %s || Ukupan broj\n' % ' || '.join(namespaces))

        for rank, (user, total) in enumerate(totals, 1):
            per_ns = contribs[user]
            out.write('|-\n|%d. || [[Korisnik:%s|%s]] ||' % (rank, user, user))
            # One cell per namespace column, in header order.
            for ns in namespaces:
                if ns in per_ns:
                    out.write(' %d ||' % per_ns[ns])
                else:
                    out.write('  ||')
            out.write(' %d\n' % total)

        out.write('|}\n')

    print('\nDONE: Wrote contrib table to file %s' % name)