Fancy stats!

This commit is contained in:
pietervdvn 2021-03-13 15:31:28 +01:00
parent 558265eba9
commit 0d60991f57
25 changed files with 3057 additions and 2953 deletions

30
Docs/Stats.md Normal file
View file

@ -0,0 +1,30 @@
Statistics
==========
There are some fancy statistics available about MapComplete usage. You can find all the stats (and the scripts to generate them) [here](Docs/Tools/).
All Time usage
--------------
![Cumulative contributors](Docs/Tools/CumulativeContributors.png)
![Cumulative changesets per contributor](Docs/Tools/Cumulative changesets per contributor.png)
Note: in 2020, MapComplete would still make one changeset per answered question. This heavily skews the graphs below towards `buurtnatuur` and `cyclofix`, two heavily used themes at the beginning.
![Cumulative changesets per theme](Docs/Tools/Cumulative changesets per theme.png)
![Theme distribution](Docs/Tools/Theme distribution.png)
2020
----
![Cumulative contributors](Docs/Tools/CumulativeContributors in 2020.png)
![Cumulative changesets per contributor](Docs/Tools/Cumulative changesets per contributor in 2020.png)
![Cumulative changesets per theme](Docs/Tools/Cumulative changesets per theme in 2020.png)
![Theme distribution](Docs/Tools/Theme distribution in 2020.png)
2021
----
![Cumulative contributors](Docs/Tools/CumulativeContributors in 2021.png)
![Cumulative changesets per contributor](Docs/Tools/Cumulative changesets per contributor in 2021.png)
![Cumulative changesets per theme](Docs/Tools/Cumulative changesets per theme in 2021.png)
![Theme distribution](Docs/Tools/Theme distribution in 2021.png)

Binary file not shown.

After

Width:  |  Height:  |  Size: 388 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 341 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 496 KiB

After

Width:  |  Height:  |  Size: 493 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 644 KiB

After

Width:  |  Height:  |  Size: 645 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 708 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 217 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 602 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 104 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 102 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 109 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 279 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 301 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 310 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 528 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 224 KiB

Binary file not shown.

After

Width:  |  Height:  |  Size: 710 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 273 KiB

After

Width:  |  Height:  |  Size: 272 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 269 KiB

After

Width:  |  Height:  |  Size: 270 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 535 KiB

After

Width:  |  Height:  |  Size: 599 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 550 KiB

After

Width:  |  Height:  |  Size: 551 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 760 KiB

After

Width:  |  Height:  |  Size: 758 KiB

View file

@ -4,10 +4,6 @@ from datetime import datetime
from matplotlib import pyplot from matplotlib import pyplot
def clean(s):
return s.strip().strip("\"")
def counts(lst): def counts(lst):
counts = {} counts = {}
for v in lst: for v in lst:
@ -75,21 +71,25 @@ class Hist:
csv += "\n" csv += "\n"
return csv return csv
def __str__(self):
return str(self.dictionary)
def build_hist(stats, keyIndex, valueIndex, condition=None):
def build_hist(stats, keyIndex, valueIndex):
hist = Hist("date") hist = Hist("date")
c = 0 c = 0
for row in stats: for row in stats:
if condition is not None and not condition(row):
continue
c += 1 c += 1
row = list(map(clean, row))
hist.add(row[keyIndex], row[valueIndex]) hist.add(row[keyIndex], row[valueIndex])
return hist return hist
def cumulative_users(stats, year=""): def as_date(str):
users_hist = build_hist(stats, 0, 1, lambda row: row[0].startswith(year)) return datetime.strptime(str, "%Y-%m-%d")
def cumulative_users(stats):
users_hist = build_hist(stats, 0, 1)
all_users_per_day = users_hist.mapcumul( all_users_per_day = users_hist.mapcumul(
lambda users: set(users), lambda users: set(users),
lambda a, b: a.union(b), lambda a, b: a.union(b),
@ -100,7 +100,7 @@ def cumulative_users(stats, year=""):
new_users = [0] new_users = [0]
for i in range(len(cumul_uniq) - 1): for i in range(len(cumul_uniq) - 1):
new_users.append(cumul_uniq[i + 1] - cumul_uniq[i]) new_users.append(cumul_uniq[i + 1] - cumul_uniq[i])
dates = map(lambda dt: datetime.strptime(dt, "%Y-%m-%d"), users_hist.keys()) dates = map(as_date, users_hist.keys())
return list(dates), cumul_uniq, list(unique_per_day), list(new_users) return list(dates), cumul_uniq, list(unique_per_day), list(new_users)
@ -110,66 +110,35 @@ def pyplot_init():
pyplot.tight_layout() pyplot.tight_layout()
def create_usercount_graphs(stats, year="", show=False): def create_usercount_graphs(stats, extra_text=""):
print("Creating usercount graphs "+year) print("Creating usercount graphs " + extra_text)
dates, cumul_uniq, unique_per_day, new_users = cumulative_users(stats, year) dates, cumul_uniq, unique_per_day, new_users = cumulative_users(stats)
total = cumul_uniq[-1] total = cumul_uniq[-1]
if year != "":
year = " in " + year
pyplot_init() pyplot_init()
pyplot.fill_between(dates, unique_per_day, label='Unique contributors') pyplot.fill_between(dates, unique_per_day, label='Unique contributors')
pyplot.fill_between(dates, new_users, label='First time contributor via MapComplete') pyplot.fill_between(dates, new_users, label='First time contributor via MapComplete')
pyplot.legend() pyplot.legend()
pyplot.title("Unique contributors" + year + ' with MapComplete (' + str(total) + ' contributors)') pyplot.title("Unique contributors" + extra_text + ' with MapComplete (' + str(total) + ' contributors)')
pyplot.ylabel("Number of unique contributors") pyplot.ylabel("Number of unique contributors")
pyplot.xlabel("Date") pyplot.xlabel("Date")
if show: pyplot.savefig("Contributors" + extra_text + ".png", dpi=400, facecolor='w', edgecolor='w', bbox_inches='tight')
pyplot.show()
else:
pyplot.savefig("Contributors" + year + ".png", dpi=400, facecolor='w', edgecolor='w', bbox_inches='tight')
pyplot_init() pyplot_init()
pyplot.plot(dates, cumul_uniq, label='Cumulative unique contributors') pyplot.plot(dates, cumul_uniq, label='Cumulative unique contributors')
pyplot.legend() pyplot.legend()
pyplot.title("Cumulative unique contributors" + year + " with MapComplete - " + str(total) + " contributors") pyplot.title("Cumulative unique contributors" + extra_text + " with MapComplete - " + str(total) + " contributors")
pyplot.ylabel("Number of unique contributors") pyplot.ylabel("Number of unique contributors")
pyplot.xlabel("Date") pyplot.xlabel("Date")
if show: pyplot.savefig("CumulativeContributors" + extra_text + ".png", dpi=400, facecolor='w', edgecolor='w',
pyplot.show() bbox_inches='tight')
else:
pyplot.savefig("CumulativeContributors" + year + ".png", dpi=400, facecolor='w', edgecolor='w',
bbox_inches='tight')
def create_yearly_usercount_graphs(contents): def create_theme_breakdown(stats, fileExtra="", cutoff=5):
create_usercount_graphs(contents) print("Creating theme breakdown " + fileExtra)
currentYear = datetime.now().year
for year in range(2020, currentYear + 1):
create_usercount_graphs(contents, str(year))
theme_remappings = {
"null": "buurtnatuur",
"metamap": "maps",
"wiki:mapcomplete/fritures": "fritures",
"lits": "lit",
"wiki:user:joost_schouppe/campersite": "campersite",
"wiki-user-joost_schouppe-geveltuintjes": "geveltuintjes",
"wiki-user-joost_schouppe-campersite":"campersites",
"https://raw.githubusercontent.com/osmbe/play/master/mapcomplete/geveltuinen/geveltuinen.json": "geveltuintjes"
}
def create_theme_breakdown(stats, year="", user=None, columnIndex=3):
print("Creating theme breakdown "+year)
themeCounts = {} themeCounts = {}
for row in stats: for row in stats:
if not row[0].startswith(year): theme = row[3].lower()
continue
if user is not None and clean(row[1]) != user:
continue
theme = clean(row[columnIndex]).lower()
if theme in theme_remappings: if theme in theme_remappings:
theme = theme_remappings[theme] theme = theme_remappings[theme]
if theme in themeCounts: if theme in themeCounts:
@ -178,55 +147,157 @@ def create_theme_breakdown(stats, year="", user=None, columnIndex=3):
themeCounts[theme] = 1 themeCounts[theme] = 1
themes = list(themeCounts.items()) themes = list(themeCounts.items())
if len(themes) == 0: if len(themes) == 0:
print("No entries found for user "+user+" in "+year) print("No entries found for theme breakdown (extra: " + str(fileExtra) + ")")
return return
themes.sort(key=lambda kv : kv[1], reverse=True) themes.sort(key=lambda kv: kv[1], reverse=True)
cutoff = 5
if user is not None:
cutoff = 0
other_count = sum([theme[1] for theme in themes if theme[1] < cutoff]) other_count = sum([theme[1] for theme in themes if theme[1] < cutoff])
themes_filtered = [theme for theme in themes if theme[1] >= cutoff] themes_filtered = [theme for theme in themes if theme[1] >= cutoff]
keys = list(map(lambda kv : kv[0] + " (" + str(kv[1])+")", themes_filtered)) keys = list(map(lambda kv: kv[0] + " (" + str(kv[1]) + ")", themes_filtered))
values = list(map(lambda kv : kv[1], themes_filtered)) values = list(map(lambda kv: kv[1], themes_filtered))
total =sum(map(lambda kv:kv[1], themes)) total = sum(map(lambda kv: kv[1], themes))
first_pct = themes[0][1] / total; first_pct = themes[0][1] / total;
if year != "":
year = " in " + year
if other_count > 0: if other_count > 0:
keys.append("other") keys.append("other")
values.append(other_count) values.append(other_count)
pyplot_init() pyplot_init()
pyplot.pie(values, labels=keys, startangle=(90 - 360 * first_pct/2)) pyplot.pie(values, labels=keys, startangle=(90 - 360 * first_pct / 2))
if user is None: pyplot.title("MapComplete changes per theme" + fileExtra + " - " + str(total) + " total changes")
user = "" pyplot.savefig("Theme distribution" + fileExtra + ".png", dpi=400, facecolor='w', edgecolor='w',
else:
user = " by contributor "+user
pyplot.title("MapComplete changes per theme"+year+user+ " - "+str(total)+" total changes")
pyplot.savefig("Theme distribution" + user+year + ".png", dpi=400, facecolor='w', edgecolor='w',
bbox_inches='tight') bbox_inches='tight')
return themes return themes
def gen_theme_breakdown_graphs(contents, user=None):
create_theme_breakdown(contents, "", user) def cumulative_changes_per(contents, index, subject, filenameextra="", cutoff=5, cumulative=True):
print("Creating graph about " + subject + filenameextra)
themes = Hist("date")
dates_per_theme = Hist("theme")
all_themes = set()
for row in contents:
th = row[index]
all_themes.add(th)
themes.add(as_date(row[0]), th)
dates_per_theme.add(th, row[0])
per_theme_count = list(zip(dates_per_theme.keys(), dates_per_theme.map(len)))
# per_theme_count is sorted ascending: the least popular theme comes first
per_theme_count.sort(key=lambda kv: kv[1], reverse=False)
values_to_show = [] # (theme name, value to fill between - this is stacked, with the first layer to print last)
running_totals = None
other_total = 0
other_theme_count = 0
other_cumul = None
for kv in per_theme_count:
theme = kv[0]
total_for_this_theme = kv[1]
if cumulative:
edits_per_day_cumul = themes.mapcumul(
lambda themes_for_date: len([x for x in themes_for_date if theme == x]),
lambda a, b: a + b, 0)
else:
edits_per_day_cumul = themes.map(lambda themes_for_date: len([x for x in themes_for_date if theme == x]))
if running_totals is None:
running_totals = edits_per_day_cumul
else:
running_totals = list(map(lambda ab: ab[0] + ab[1], zip(running_totals, edits_per_day_cumul)))
if total_for_this_theme >= cutoff:
values_to_show.append((theme, running_totals))
else:
other_total += total_for_this_theme
other_theme_count += 1
if other_cumul is None:
other_cumul = edits_per_day_cumul
else:
other_cumul = list(map(lambda ab: ab[0] + ab[1], zip(other_cumul, edits_per_day_cumul)))
keys = themes.keys()
values_to_show.reverse()
values_to_show.append(("other", other_cumul))
totals = dict(per_theme_count)
total = sum(totals.values())
totals["other"] = other_total
pyplot_init()
for kv in values_to_show:
if kv[1] is None:
continue # No 'other' graph
msg = kv[0] + " (" + str(totals[kv[0]]) + ")"
if kv[0] == "other":
msg = str(other_theme_count) + " small " + subject + "s (" + str(other_total) + " changes)"
pyplot.fill_between(keys, kv[1], label=msg)
if cumulative:
cumulative_txt = "Cumulative changesets"
else:
cumulative_txt = "Changesets"
pyplot.title(cumulative_txt + " per " + subject + filenameextra + " (" + str(total) + " changesets)")
pyplot.legend(loc="upper left", ncol=3)
pyplot.savefig(cumulative_txt + " per " + subject + filenameextra + ".png")
def contents_where(contents, index, starts_with, invert=False):
    """Yield the rows whose column ``index`` starts with ``starts_with``.

    Args:
        contents: iterable of rows; each row must be indexable and hold a
            string at position ``index``.
        index: position of the column to inspect.
        starts_with: prefix (or tuple of prefixes) passed to ``str.startswith``.
        invert: when truthy, yield the rows that do NOT match instead.

    Yields:
        The selected rows, unchanged and in their original order.
    """
    # Normalise the flag to a real bool. Comparing the result of
    # ``startswith`` by identity against ``invert`` only works for genuine
    # bools: a truthy non-bool such as 1 would select every row.
    wanted = not invert
    for row in contents:
        if row[index].startswith(starts_with) == wanted:
            yield row
def create_graphs(contents):
create_usercount_graphs(contents)
create_theme_breakdown(contents)
cumulative_changes_per(contents, 3, "theme", cutoff=10)
cumulative_changes_per(contents, 1, "contributor", cutoff=15)
cumulative_changes_per(contents, 2, "language", cutoff=1)
cumulative_changes_per(contents, 4, "version number", cutoff=1)
currentYear = datetime.now().year currentYear = datetime.now().year
for year in range(2020, currentYear + 1): for year in range(2020, currentYear + 1):
create_theme_breakdown(contents, str(year), user) contents_filtered = list(contents_where(contents, 0, str(year)))
extratext = " in " + str(year)
create_usercount_graphs(contents_filtered, extratext)
create_theme_breakdown(contents_filtered, extratext)
cumulative_changes_per(contents_filtered, 3, "theme", extratext, cutoff=5)
cumulative_changes_per(contents_filtered, 1, "contributor", extratext, cutoff=10)
cumulative_changes_per(contents_filtered, 2, "language", extratext, cutoff=1)
cumulative_changes_per(contents_filtered, 4, "version number", extratext, cutoff=1, cumulative=False)
def changes_per_theme_daily(contents):
hist = {} theme_remappings = {
"metamap": "maps",
"groen": "buurtnatuur",
"wiki:mapcomplete/fritures": "fritures",
"wiki:MapComplete/Fritures": "fritures",
"lits": "lit",
"pomp": "cyclofix",
"wiki:user:joost_schouppe/campersite": "campersite",
"wiki-user-joost_schouppe-geveltuintjes": "geveltuintjes",
"wiki-user-joost_schouppe-campersite": "campersites",
"wiki-User-joost_schouppe-campersite": "campersites",
"wiki-User-joost_schouppe-geveltuintjes": "geveltuintjes",
"wiki:User:joost_schouppe/campersite": "campersites",
"https://raw.githubusercontent.com/osmbe/play/master/mapcomplete/geveltuinen/geveltuinen.json": "geveltuintjes"
}
def clean_input(contents):
    """Normalise raw CSV rows: strip padding/quotes and resolve the theme.

    Args:
        contents: iterable of rows (lists of strings). Field 3 holds the
            theme and field 7 the changeset comment.

    Yields:
        A new list per row in which every field is stripped of surrounding
        whitespace and double quotes, and field 3 is replaced by the
        lower-cased theme after applying ``theme_remappings``. The input
        row itself is left untouched.
    """
    for row in contents:
        fields = [data.strip().strip("\"") for data in row]
        theme = fields[3].lower()
        if theme == "null":
            # The theme metadata has only been set later on - we fetch this
            # from the last hashtag of the comment. The comment is cleaned
            # before splitting, so no character is dropped blindly; when the
            # comment has no '#', the theme stays "null" instead of becoming
            # the whole comment text.
            _, marker, tag = fields[7].rpartition("#")
            if marker:
                theme = tag.strip().lower()
        # NOTE: the theme is lower-cased before this lookup, so only
        # lower-case keys of theme_remappings can ever match.
        theme = theme_remappings.get(theme, theme)
        fields[3] = theme
        yield fields
def main(): def main():
print("Creating graphs...") print("Creating graphs...")
with open('stats.csv', newline='') as csvfile: with open('stats.csv', newline='') as csvfile:
stats = list(csv.reader(csvfile, delimiter=',', quotechar='"')) stats = list(clean_input(csv.reader(csvfile, delimiter=',', quotechar='"')))
print("Found "+str(len(stats))+" changesets") print("Found " + str(len(stats)) + " changesets")
create_yearly_usercount_graphs(stats) create_graphs(stats)
gen_theme_breakdown_graphs(stats)
print("All done!") print("All done!")

View file

@ -13,7 +13,7 @@ echo "" > tmp.csv
for f in stats.*.json for f in stats.*.json
do do
echo $f echo $f
jq ".features[].properties | [.date, .user, .metadata.language, .metadata.theme, .editor, .create, .modify]" "$f" | tr -d "\n" | sed "s/]\[/\n/g" | tr -d "][" >> tmp.csv jq ".features[].properties | [.date, .user, .metadata.language, .metadata.theme, .editor, .create, .modify, .comment]" "$f" | tr -d "\n" | sed "s/]\[/\n/g" | tr -d "][" >> tmp.csv
echo "" >> tmp.csv echo "" >> tmp.csv
done done

File diff suppressed because it is too large Load diff