Корисник:Rainman/Bizma.py
Изглед
Korišćenje programa:
import bizma
bizma.plainText('srwiki-latest-stub-meta-history.xml') # za lepo formatiran plaintext
bizma.wikiText('srwiki-latest-stub-meta-history.xml') # za wiki tabelu samo sa ukupnim brojem promena
bizma.wikiTextByNs('srwiki-latest-stub-meta-history.xml') # za wiki tabele za svaki imenski prostor
bizma.wikiTextAllTable('srwiki-latest-stub-meta-history.xml') # za veliku wiki tabelu sa svim informacijama
Program koristi dump-ove sa adrese http://download.wikimedia.org/srwiki i to fajl srwiki-latest-stub-meta-history.xml (fajl se može proslediti i neotpakovan, tj u .gz formatu, međutim tada izvršavanje traje nešto duže). Program proizvodi izlazni fajl sa vremenskim žigom, i ukoliko imate dovoljno slobodne memorije trebalo bi da završi za nekih 5-6 sekundi.
import re
import operator
import time
import gzip
def getContrib(filename):
    '''Count edits per user and return them sorted by edit count.

    filename should be a *-stub-meta-history.xml dump, optionally
    gzip-compressed (name ending in '.gz'). The file is read as UTF-8 text.

    Every line containing a <username>...</username> element counts as one
    edit by that user; lines without one are ignored.

    Returns a list of (user, editcount) tuples, highest edit count first.
    Users with equal counts keep the order in which they first appear in
    the dump.
    '''
    # gzip.open defaults to binary mode, which would hand bytes to a str
    # regex; 'rt' makes both openers yield decoded text lines.
    opener = gzip.open if filename.endswith('.gz') else open
    username_re = re.compile(r'<username>(.*?)</username>')
    contribs = {}
    with opener(filename, 'rt', encoding='utf-8') as f:
        for line in f:
            match = username_re.search(line)
            if match:
                user = match.group(1)
                contribs[user] = contribs.get(user, 0) + 1
    return sorted(contribs.items(), key=operator.itemgetter(1), reverse=True)
def filterNamespace(ns, item):
    '''Project a per-namespace contribution record onto one namespace.

    item has the form (username, {ns1: editcount, ns2: editcount, ...}).
    Returns (username, editcount_for_ns); the count defaults to 0 when the
    user has no entry for ns.
    '''
    user, per_ns = item[0], item[1]
    return (user, per_ns.get(ns, 0))
def getContribByNamespace(filename):
    '''Count edits per user, broken down by namespace.

    filename should be a *-stub-meta-history.xml dump, optionally
    gzip-compressed (name ending in '.gz'). The file is read as UTF-8 text.

    Returns the triplet (contribs, namespaces, nsContrib):
     * contribs   - dictionary {username: {ns1: editcount, ns2: editcount}}
     * namespaces - list of namespace names, starting with 'Glavni' (the
                    main namespace) followed by the names declared before
                    the closing </siteinfo> tag
     * nsContrib  - list [(ns1, sortedUserList), (ns2, sortedUserList), ...]
                    where sortedUserList is a list of (user, editcount)
                    for every user, highest edit count first (0 when the
                    user has no edits in that namespace)
    '''
    # gzip.open defaults to binary mode, which would hand bytes to a str
    # regex; 'rt' makes both openers yield decoded text lines.
    opener = gzip.open if filename.endswith('.gz') else open
    namespace_re = re.compile(r'<namespace key.*?>(.*?)</namespace>')
    username_re = re.compile(r'<username>(.*?)</username>')
    title_re = re.compile(r'<title>(.*?)</title>')

    main_ns = 'Glavni'
    namespaces = [main_ns]
    contribs = {}
    with opener(filename, 'rt', encoding='utf-8') as f:
        # Read the namespace declarations from the <siteinfo> header.
        for line in f:
            match = namespace_re.search(line)
            # An empty name would duplicate the main namespace, which is
            # already represented by main_ns.
            if match and match.group(1):
                namespaces.append(match.group(1))
            if '</siteinfo>' in line:
                break

        # Read contributions from the remainder of the same file handle.
        known = set(namespaces)
        # Edits seen before any <title> are attributed to the main namespace.
        current_ns = main_ns
        for line in f:
            match = title_re.search(line)
            if match:
                prefix, sep, _ = match.group(1).partition(':')
                # Only a declared namespace counts as a prefix; a colon in
                # an ordinary title ("Star Wars: Episode I") must not
                # create a bogus namespace.
                current_ns = prefix if sep and prefix in known else main_ns
            match = username_re.search(line)
            if match:
                per_ns = contribs.setdefault(match.group(1), {})
                per_ns[current_ns] = per_ns.get(current_ns, 0) + 1

    # One (namespace, sorted list of user/editcount pairs) entry per namespace.
    nsContrib = []
    for ns in namespaces:
        pairs = [(user, per_ns.get(ns, 0)) for user, per_ns in contribs.items()]
        pairs.sort(key=operator.itemgetter(1), reverse=True)
        nsContrib.append((ns, pairs))
    return contribs, namespaces, nsContrib
def plainText(filename):
    '''Write the sorted (user, editcount) list as a plain-text table.

    The table is built from getContrib(filename) and written, UTF-8
    encoded, to a file named 'sortedContribList-<timestamp>' in the current
    directory. The file name is printed when done.
    '''
    sContrib = getContrib(filename)
    name = 'sortedContribList-%s' % time.strftime('%Y-%m-%d@%H:%M:%S', time.localtime())
    with open(name, 'w', encoding='utf-8') as out:
        out.write('Rank %-30s -> %8s\n' % ('Username', 'edits'))
        out.write('----------------------------------------------------\n')
        # %-30s pads by characters on text strings, so no extra padding is
        # needed for non-ASCII names. (Counting UTF-8 lead bytes to widen
        # the column is only meaningful for undecoded byte strings.)
        for rank, contr in enumerate(sContrib, 1):
            out.write('%-6d %-30s -> %8d\n' % (rank, contr[0], contr[1]))
    print('\nDONE: Wrote contrib table to file %s' % name)
def wikiOutput(out, contrib):
    '''Write a sorted contribution list as a wiki table.

    out     - object with a write(str) method (e.g. an open text file)
    contrib - iterable of (user, editcount) pairs, already sorted

    Entries with an edit count of 0 are skipped and do not consume a rank,
    so ranks in the output are consecutive starting from 1.
    '''
    out.write('{|\n| Rang || Korisnik || Broj izmena\n')
    nonzero = (contr for contr in contrib if contr[1] != 0)
    for rank, contr in enumerate(nonzero, 1):
        out.write('|-\n|%d. || [[Korisnik:%s|%s]] || %d\n' % (rank, contr[0], contr[0], contr[1]))
    out.write('|}\n')
def wikiText(filename):
    '''Write the sorted (user, editcount) list as a single wiki table.

    The table is built from getContrib(filename) and written, UTF-8
    encoded, to a file named 'sortedContribList-<timestamp>' in the current
    directory. The file name is printed when done.
    '''
    sContrib = getContrib(filename)
    name = 'sortedContribList-%s' % time.strftime('%Y-%m-%d@%H:%M:%S', time.localtime())
    # The with-block closes the file even if writing fails part-way.
    with open(name, 'w', encoding='utf-8') as out:
        wikiOutput(out, sContrib)
    print('\nDONE: Wrote contrib table to file %s' % name)
def wikiTextByNs(filename):
    '''Write one wiki table of contributors per namespace.

    Uses the per-namespace sorted lists from getContribByNamespace(filename).
    Each namespace gets a '=== name ===' heading followed by its table.
    Output goes, UTF-8 encoded, to a file named
    'sortedContribList-<timestamp>' in the current directory; the file name
    is printed when done.
    '''
    # Only the per-namespace sorted lists are needed here.
    _, _, nsContribs = getContribByNamespace(filename)
    name = 'sortedContribList-%s' % time.strftime('%Y-%m-%d@%H:%M:%S', time.localtime())
    with open(name, 'w', encoding='utf-8') as out:
        for nsContrib in nsContribs:
            out.write('=== %s ===\n' % nsContrib[0])
            wikiOutput(out, nsContrib[1])
    print('\nDONE: Wrote contrib table to file %s' % name)
def wikiTextAllTable(filename):
    '''Write one big wiki table of contributors.

    Each row holds the rank, the user, the edit count in every namespace
    (cell left empty when the user has no edits there) and the total number
    of edits. Rows are ordered by total, highest first.

    Output goes, UTF-8 encoded, to a file named
    'sortedContribList-<timestamp>' in the current directory; the file name
    is printed when done.
    '''
    contribs, namespaces, _ = getContribByNamespace(filename)
    # A user's total is the sum of their per-namespace counts, so a second
    # pass over the dump just to compute totals is unnecessary.
    totals = [(user, sum(per_ns.values())) for user, per_ns in contribs.items()]
    totals.sort(key=operator.itemgetter(1), reverse=True)

    name = 'sortedContribList-%s' % time.strftime('%Y-%m-%d@%H:%M:%S', time.localtime())
    with open(name, 'w', encoding='utf-8') as out:
        out.write('{|\n| Rang || Korisnik || %s || Ukupan broj\n' % ' || '.join(namespaces))
        for rank, (user, total) in enumerate(totals, 1):
            per_ns = contribs[user]
            cells = ''.join(
                ' %d ||' % per_ns[ns] if ns in per_ns else ' ||'
                for ns in namespaces
            )
            out.write('|-\n|%d. || [[Korisnik:%s|%s]] ||%s %d\n' % (rank, user, user, cells, total))
        out.write('|}\n')
    print('\nDONE: Wrote contrib table to file %s' % name)