Add new python code with mattermost scraping stuff

This commit is contained in:
mcbloch 2022-10-06 22:44:04 +02:00
parent 5f89372558
commit 3c13ec2eae
12 changed files with 585 additions and 171 deletions

8
.gitignore vendored
View file

@ -1,4 +1,10 @@
data/
drive-temp/
cookiefile
db.json
.env
venv/
__pycache__/
.idea
users.toml

1
.tool-versions Normal file
View file

@ -0,0 +1 @@
python 3.10.4

2
Makefile Normal file
View file

@ -0,0 +1,2 @@
# `make web`: run src/web.py through the hug command-line runner.
web:
python -m hug -f src/web.py

View file

@ -1,22 +0,0 @@
#!/bin/bash
# Download every note listed in the logged-in CodiMD user's history into
# data/note-<id>.md.
# Reads CMD_SERVER_URL, CMD_EMAIL and CMD_PASSWORD from the environment.
mkdir -p data
echo "Login to CodiMD"
# Log in and store the session cookie in ./cookiefile for the requests below.
curl -c cookiefile "$CMD_SERVER_URL/login" -X POST -H "Referer: $CMD_SERVER_URL/" --data-raw "email=$CMD_EMAIL&password=$CMD_PASSWORD" >/dev/null
echo
echo
# NOTE(review): the login uses $CMD_SERVER_URL, but every request from here on
# hard-codes https://codimd.zeus.gent - confirm whether that is intentional.
# Print the profile of the logged-in user.
curl -b cookiefile 'https://codimd.zeus.gent/me' | jq
echo
# Fetch the note history as JSON.
notes_history=$(curl -b cookiefile 'https://codimd.zeus.gent/history')
# echo $notes_history | jq
# note_id=$(echo "$notes_history" | jq -r '.history[1].id')
# One note id per line, taken from the .history array.
ids=$(echo "$notes_history" | jq -r '.history | map(.id) | .[]')
# Download each note to data/note-<id>.md.
while IFS= read -r line; do
echo "... Reading note with ID: $line ..."
curl -b cookiefile "https://codimd.zeus.gent/$line/download" >"data/note-$line.md"
done <<<"$ids"

4
requirements.txt Normal file
View file

@ -0,0 +1,4 @@
tabulate
colored
mattermostdriver
hug

131
src/MattermostObjects.py Normal file
View file

@ -0,0 +1,131 @@
from typing import NamedTuple, List, Dict
class MMUser(NamedTuple):
    """Immutable record built from a Mattermost user dict.

    The field names are the keyword arguments accepted by :meth:`load`.
    Fields without a default are required; fields with a default may be
    omitted from the input dict.
    """

    id: str
    create_at: int
    update_at: int
    delete_at: int
    username: str
    first_name: str
    last_name: str
    nickname: str
    email: str
    auth_data: str
    auth_service: str
    roles: str
    locale: str
    timezone: dict
    # Type not established by this file; `object` replaces the builtin
    # function `any`, which is not a type.
    position: object
    is_bot: bool = None
    bot_description: str = None
    email_verified: bool = None
    notify_props: dict = None
    last_password_update: int = None
    failed_attempts: int = None
    mfa_active: bool = False
    terms_of_service_id: str = None
    terms_of_service_create_at: int = None
    # NOTE(review): this default dict is a single object shared by every
    # instance created without `props`; treat it as read-only.
    props: dict = {}
    last_picture_update: int = None

    @staticmethod
    def load(data):
        """Build an MMUser from a dict of field values.

        :param data: mapping whose keys are MMUser field names.
        :return: the new MMUser, or None when `data` has missing or
            unexpected keys (the TypeError is printed, not raised).
        """
        try:
            return MMUser(**data)
        except TypeError as e:
            print("[ERROR] Could not load dict into MMUser namedtuple")
            print(str(e))
            return None
class MMPostProps(NamedTuple):
    """Immutable record for the ``props`` dict attached to a post.

    Every field has a default, so any subset of the keys may be supplied.
    The field order is part of the tuple layout and must not change.
    """

    # Sender overrides. `from_webhook` defaults to the literal False.
    from_webhook: str = False
    override_icon_url: str = None
    override_username: str = None
    webhook_display_name: str = None
    channel_mentions: Dict = None

    # Presumably ids written by matterircd bridges - TODO confirm.
    matterircd_krcggydky38kdcuubsc7fddc7w: str = None
    matterircd_s4ptwhx7wfnx7qwexp1khorh7e: str = None

    username: str = None
    userId: str = None

    # Old/new value pairs.
    old_header: str = None
    new_header: str = None
    old_purpose: str = None
    new_purpose: str = None
    old_displayname: str = None
    new_displayname: str = None

    remove_link_preview: str = None

    # Added/removed user references.
    removedUserId: str = None
    addedUserId: str = None
    removedUsername: str = None
    addedUsername: str = None

    message: str = None
    attachments: str = None
    # Defaults to the literal False, like `from_webhook`.
    from_bot: str = False
class MMPost(NamedTuple):
    """Immutable record built from a Mattermost post dict.

    Fields without a default are required by :meth:`load`; the trailing
    fields may be absent from the input.
    """

    channel_id: str
    create_at: int
    delete_at: int
    edit_at: int
    hashtags: str
    id: str
    is_pinned: bool
    message: str
    metadata: Dict
    original_id: str
    parent_id: str
    pending_post_id: str
    root_id: str
    type: str
    update_at: int
    user_id: str
    message_source: str = None
    has_reactions: bool = None
    file_ids: List[str] = None
    props: MMPostProps = None

    def from_human(self):
        """Return True unless the props mark the post as webhook or bot.

        A post counts as human-written when it has no props at all, or when
        both ``from_webhook`` and ``from_bot`` are still the literal False.
        """
        return self.props is None or (
            self.props.from_webhook is False and self.props.from_bot is False
        )

    @staticmethod
    def load(data):
        """Build an MMPost from a dict of field values.

        The nested ``"props"`` dict, when present, is converted to an
        MMPostProps; if that conversion fails the error is printed and the
        post is loaded with ``props=None``.

        :param data: mapping whose keys are MMPost field names. It is not
            modified.
        :return: the new MMPost, or None when `data` has missing or
            unexpected keys (the TypeError is printed, not raised).
        """
        try:
            # Shallow copy: removing "props" must not alter the caller's dict,
            # which may be loaded or inspected again later.
            fields = {**data}
            props = None
            if "props" in fields:
                raw_props = fields.pop("props")
                try:
                    props = MMPostProps(**raw_props)
                except TypeError as e:
                    print("[ERROR] Could not load dict into MMPostProps namedtuple")
                    print(str(e))
            return MMPost(props=props, **fields)
        except TypeError as e:
            print("[ERROR] Could not load dict into MMPost namedtuple")
            print(str(e))
            return None
class MMChannelPosts(NamedTuple):
    """Immutable record built from a "posts for channel" response dict."""

    prev_post_id: str
    next_post_id: str
    order: List[str]
    posts: Dict[str, MMPost]
    # Types not established by this file; `object` replaces the builtin
    # function `any`, which is not a type.
    disable_group_highlight: object
    reply_count: object

    @staticmethod
    def load(data):
        """Build an MMChannelPosts from a response dict.

        Each value of ``data["posts"]`` is converted with ``MMPost.load``.

        :param data: mapping whose keys are MMChannelPosts field names. It is
            not modified.
        :return: the new MMChannelPosts, or None when a TypeError occurs
            (it is printed, not raised).
        :raises KeyError: when `data` has no ``"posts"`` key.
        """
        try:
            # Shallow copy: removing "posts" must not alter the caller's dict.
            fields = {**data}
            raw_posts = fields.pop("posts")
            posts: Dict[str, MMPost] = {
                post_id: MMPost.load(raw_post)
                for post_id, raw_post in raw_posts.items()
            }
            return MMChannelPosts(posts=posts, **fields)
        except TypeError as e:
            print("[ERROR] Could not load dict into MMChannelPosts namedtuple")
            print(str(e))
            return None

63
src/db.py Normal file
View file

@ -0,0 +1,63 @@
import json
from os.path import exists
from typing import List
# Path of the JSON file that backs every helper in this module.
db_filename = "db.json"


def init_db():
    """Create the database file holding an empty JSON object if it is missing."""
    if exists(db_filename):
        return
    print("Initializing json file database")
    with open(db_filename, "w") as db_file:
        db_file.write("{}")


# Runs on import so the file exists before any helper reads it.
init_db()
def _load_db():
    """Return the parsed JSON content of the database file."""
    with open(db_filename, "r") as db_file:
        return json.load(db_file)
def _save_db(db):
    """Overwrite the database file with `db` serialised as JSON.

    :param db: JSON-serialisable object to store.
    :raises TypeError: when `db` contains a value json cannot encode. The
        existing file is left untouched in that case.
    """
    # Serialise first: opening with "w" truncates the file, so encoding must
    # succeed before the old content is discarded.
    payload = json.dumps(db)
    with open(db_filename, "w") as db_file:
        db_file.write(payload)
def get_latest_sync_time() -> int:
    """Return the stored ``latest_sync_time``, or 0 when none is stored."""
    return _load_db().get("latest_sync_time", 0)
def set_latest_sync_time(le_date) -> None:
    """Store `le_date` under ``latest_sync_time`` and persist the database."""
    state = _load_db()
    state.update({"latest_sync_time": le_date})
    _save_db(state)
def add_discovered_file(file_url) -> List[str]:
    """Record `file_url` as discovered and persist the database.

    The stored list stays free of duplicates and keeps its existing order,
    with a new URL appended at the end. Passing the values through a set
    would reshuffle the list on every write.

    :param file_url: URL to remember.
    :return: the updated list of discovered URLs.
    """
    db = _load_db()
    # dict keys are unique and keep insertion order.
    ordered = dict.fromkeys(db.get("discovered_files", []))
    ordered[file_url] = None
    discovered_files = list(ordered)
    db["discovered_files"] = discovered_files
    _save_db(db)
    return discovered_files
def get_discovered_files() -> List[str]:
    """Return the discovered URLs as a duplicate-free list in stored order.

    A list is returned, as the annotation promises and as
    ``add_discovered_file`` does.
    """
    db = _load_db()
    return list(dict.fromkeys(db.get("discovered_files", [])))
def add_valid_file(filename, metadata):
    """Store `metadata` for `filename` under ``valid_files`` and persist it.

    :return: the updated ``valid_files`` mapping.
    """
    state = _load_db()
    known = state.setdefault("valid_files", {})
    known[filename] = metadata
    _save_db(state)
    return known

35
src/dir_utils.py Normal file
View file

@ -0,0 +1,35 @@
import os
import re
# pattern = re.compile(":::spoiler Gitlab sync([^:]*):::")
pattern = re.compile("[^:]*:::")
def find_metadata(filename):
    """Extract the "Gitlab sync" metadata block from a note file.

    The block starts after a line ``:::spoiler Gitlab sync`` and ends at the
    next line starting with ``:::``. Each ``key: value`` line inside it
    becomes one entry; a leading markdown bullet (``- ``, ``* ``, ``+ ``) is
    dropped from the key. Lines without ``": "`` are ignored.

    :param filename: path of the note to read.
    :return: dict of metadata (possibly empty) when the block is present,
        None when the file has no such block.
    """
    with open(filename, "r") as file:
        print(f"File: {filename}")
        data = file.read()

    start_str = ":::spoiler Gitlab sync\n"
    end_str = "\n:::"

    start_i = data.find(start_str)
    if start_i < 0:
        print("Not a valid report")
        return None
    start_i += len(start_str)

    # Search from the newline that ends the opening marker, so that a block
    # closed on the very next line is recognised as empty.
    end_i = data.find(end_str, start_i - 1)
    if end_i < 0:
        # No closing marker: use the rest of the file. Slicing with -1 would
        # silently drop the last character.
        end_i = len(data)
    block = data[start_i:max(start_i, end_i)]

    metadata = {}
    for line in block.split("\n"):
        entry = line[2:] if line[:2] in ("- ", "* ", "+ ") else line
        key, separator, value = entry.partition(": ")
        if not separator or not key:
            continue
        metadata[key] = value

    print("Valid report")
    print(metadata)
    return metadata

181
src/mattermost.py Normal file
View file

@ -0,0 +1,181 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import pprint as pp
from abc import ABC, abstractmethod
from datetime import datetime
from enum import Enum
from time import sleep
from typing import Dict, List
import toml
from colored import style
from mattermostdriver import Driver
from tabulate import tabulate
from MattermostObjects import MMUser, MMPost, MMChannelPosts
from utils import humanize_date_difference, timer
pp = pp.PrettyPrinter(indent=2)
class LogLevel(Enum):
    """Severity labels; the value is the text printed between brackets by ``MMApi.log``."""

    INFO = "INFO"
    ERROR = "ERROR"
class User(ABC):
    """Abstract base class for a set of login credentials."""

    @abstractmethod
    def credentials_dict(self) -> dict:
        """Return this user's credentials as a dict."""
        ...
class NormalUser(User):
    """Credentials made of a login id and a password."""

    def __init__(self, login_id, password):
        self.login_id, self.password = login_id, password

    def credentials_dict(self) -> dict:
        """Return ``login_id`` and ``password`` as a dict."""
        return dict(login_id=self.login_id, password=self.password)

    def __repr__(self):
        # The password is deliberately masked.
        return f"User<name: {self.login_id}, password: ******>"
class TokenUser(User):
    """Credentials made of a single access token."""

    def __init__(self, token):
        self.token = token

    def credentials_dict(self) -> dict:
        """Return the token as a ``{"token": ...}`` dict."""
        return {"token": self.token}

    def __repr__(self):
        # The token is a secret: mask it, as NormalUser masks its password,
        # so it cannot leak through logs or tracebacks.
        return "TokenUser<token: ******>"
# Users loaded from users.toml, keyed by their table name in that file.
users: Dict[str, User] = {}


def loadusers():
    """Fill the module-level ``users`` mapping from ``users.toml``.

    Each top-level table becomes one user: a table with ``token`` gives a
    TokenUser, a table with ``name`` and ``password`` gives a NormalUser.

    :raises SystemExit: with status 1, after printing a message, when an
        entry matches neither shape.
    """
    with open("users.toml") as f:
        usersstring = f.read()
    usersdict = toml.loads(usersstring)

    for name, data in usersdict.items():
        # Top-level scalars are not user tables; report them as invalid
        # rather than crash on the `in` checks below.
        is_table = isinstance(data, dict)
        if is_table and "token" in data:
            users[name] = TokenUser(token=data["token"])
        elif is_table and "name" in data and "password" in data:
            users[name] = NormalUser(login_id=data["name"], password=data["password"])
        else:
            print("Invalid user '{}' in toml file".format(name))
            # `exit` is injected by the site module and may be absent.
            raise SystemExit(1)


loadusers()
def merge_dict(a: dict, b: dict) -> dict:
    """Return a new dict with the items of `a` overridden by those of `b`."""
    merged = dict(a)
    merged.update(b)
    return merged
class MMApi(Driver):
    """Mattermost ``Driver`` configured for mattermost.zeus.gent and logged in.

    After construction ``user_id`` holds the id of the logged-in user and
    ``team_id`` the id of the team named "zeus".
    """

    def __init__(self, user: User = None):
        """Create the driver and log in.

        :param user: credentials to log in with. When omitted or None, the
            ``"flynn"`` entry of the module-level ``users`` mapping is used.
        """
        if user is None:
            # Looked up at call time: a signature default of users["flynn"]
            # is evaluated once at import, so a missing entry would make the
            # whole module unimportable.
            user = users["flynn"]
        Driver.__init__(
            self,
            merge_dict(
                {
                    "url": "mattermost.zeus.gent",
                    "port": 443,
                    "debug": False,
                },
                user.credentials_dict(),
            ),
        )
        self.login()
        self.user_id = self.users.get_user(user_id="me")["id"]
        self.team_id = self.teams.get_team_by_name("zeus")["id"]
        print(" = Creating mattermost client")
        print(f" = - User: {self.user_id}")
        print(f" = - Team: {self.team_id}")

    @staticmethod
    def print_response(resp, title="Response"):
        """Pretty-print `resp` under a bold `title`."""
        print("--------")
        print(style.BOLD + title + style.RESET)
        pp.pprint(resp)

    def log(self, text: str, log_level: LogLevel = LogLevel.INFO):
        """Print `text` prefixed with the bold, bracketed log level."""
        print(f"{style.BOLD}[{log_level.value}]{style.RESET} {text}")

    def get_channel_id(self, channel_name):
        """Return the id of the channel `channel_name` in the current team."""
        # The lookup takes only the team id and the channel name. The original
        # call also passed an undefined name `since`, which raised NameError.
        resp = self.channels.get_channel_by_name(self.team_id, channel_name)
        channel_id = resp["id"]
        self.log(f"Fetching channel id for {channel_name}: {channel_id}")
        return channel_id

    @timer
    def get_posts_for_channel(self, channel_id, since):
        """Fetch the posts of a channel, following pagination when possible.

        :param channel_id: id of the channel to read.
        :param since: value sent as the ``since`` request parameter; 0 means
            "fetch everything, page by page".
        :return: dict mapping post id to the raw post dict.
        """
        print(f"Fetching posts for {channel_id} since {since}")
        page_size = 200
        page_i = 0
        data = {}

        more = True
        while more:
            resp = self.posts.get_posts_for_channel(
                channel_id,
                params={"page": page_i, "per_page": page_size, "since": since},
            )
            page_i += 1
            print(f"Fetching page {page_i}")

            paged_data = resp["posts"]

            if since != 0:
                # Per the original author's observation: with a non-zero
                # `since` the API returns at most 1000 posts and ignores the
                # page index and page size, so one request is all we can do.
                more = False
            else:
                # A short page means the last page has been reached.
                more = len(paged_data) >= page_size

            # Normalise a list of posts to the id -> post shape.
            if isinstance(paged_data, list):
                paged_data = {item["id"]: item for item in paged_data}

            # Accumulate this page into the overall result.
            data.update(paged_data)
        print()

        self.log(f"Post count: {len(data)}")
        return data
class ChannelApi(MMApi):
    """MMApi bound to one channel, resolved by name at construction time."""

    def __init__(self, channel_name, user=None):
        """Log in and look up the id of `channel_name`.

        :param channel_name: name of the channel to bind to.
        :param user: credentials to log in with; None selects MMApi's
            default user.
        """
        if user is None:
            # Forwarding None explicitly would override MMApi's default user
            # and fail on `None.credentials_dict()`.
            MMApi.__init__(self)
        else:
            MMApi.__init__(self, user)
        self.channel_id = self.get_channel_id(channel_name)

    def create_post(self, message: str, props: Dict = None) -> None:
        """Post `message` (with optional `props`) to the bound channel."""
        self.posts.create_post(
            options={"channel_id": self.channel_id, "message": message, "props": props}
        )
        self.log(f'Message successfully created: "{message}"')
if __name__ == "__main__":
    # Manual smoke test: log in as "flynn", fetch up to 200 posts of the
    # "bestuur" channel and load them into the typed records.
    api = MMApi(user=users["flynn"])
    bestuur = api.channels.get_channel_by_name(api.team_id, "bestuur")
    raw_posts = api.posts.get_posts_for_channel(
        bestuur["id"], params={"per_page": 200}
    )
    channel_posts: MMChannelPosts = MMChannelPosts.load(raw_posts)

75
src/utils.py Normal file
View file

@ -0,0 +1,75 @@
import datetime
import functools
import time
def timer(func):
    """Decorator that prints how long each call of `func` took."""

    @functools.wraps(func)
    def wrapper_timer(*args, **kwargs):
        started = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - started
        print(f"Finished {func.__name__!r} in {elapsed:.4f} secs")
        return result

    return wrapper_timer
def humanize_date_difference(
    older: datetime.datetime,
    newer: datetime.datetime = None,
    offset: int = None,
    debug=False,
):
    """Describe how long ago a moment was, in human-friendly words.

    Call it in one of two ways:

    * ``humanize_date_difference(older, newer)`` describes `older` as seen
      from `newer`.
    * ``humanize_date_difference(reference, offset=ms)`` describes the moment
      `offset` milliseconds before `reference` (passed as `older`).

    Negative differences are not supported.

    :param older: the earlier datetime, or the reference point when only
        `offset` is given.
    :param newer: the later datetime; when given, `offset` is derived from it.
    :param offset: difference in milliseconds, used when `newer` is None.
    :param debug: print the decomposed difference.
    :return: one of ``"<n> ms ago"``, ``"<n>s ago"``, ``"<m>m <s>s ago"``,
        ``"<h>h <m>m ago"``, ``"Yesterday"``, a weekday name (2-6 days) or a
        full date (7 days or more).
    :raises ValueError: when neither `newer` nor `offset` is supplied.
    """
    if newer is not None:
        dt = newer - older
        milliseconds = dt.microseconds / 1e3
        offset = milliseconds + (dt.seconds * 1000) + (dt.days * 1000 * 60 * 60 * 24)

    # Test against None, not truthiness: a difference of exactly 0 ms is a
    # valid input and must not be reported as "missing".
    if offset is None:
        raise ValueError("Must supply otherdate or offset (from now)")

    if debug:
        print(f"{offset} ms offset")

    # Split the millisecond offset into days / hours / minutes / seconds / ms.
    remaining, delta_ms = divmod(offset, 1000)
    remaining, delta_s = divmod(remaining, 60)
    remaining, delta_m = divmod(remaining, 60)
    delta_d, delta_h = divmod(remaining, 24)
    delta_ms, delta_s, delta_m, delta_h, delta_d = (
        int(part) for part in (delta_ms, delta_s, delta_m, delta_h, delta_d)
    )

    if debug:
        print("{:d} ms".format(delta_ms))
        print("{:d} s".format(delta_s))
        print("{:d} m".format(delta_m))
        print("{:d} h".format(delta_h))
        print("{:d} d".format(delta_d))

    # The described moment is "reference minus delta". With `newer` given the
    # reference is `newer`; subtracting from `older` would apply the
    # difference twice.
    reference = older if newer is None else newer

    if delta_d > 1:
        if delta_d > 6:
            date = reference - datetime.timedelta(
                days=delta_d, hours=delta_h, minutes=delta_m
            )
            # %d is the day of the month and %M the minutes. The previous
            # %m and %I printed the month number and the 12-hour hour.
            return date.strftime("%A, %Y %B %d, %H:%M")
        wday = reference - datetime.timedelta(days=delta_d)
        return wday.strftime("%A")
    if delta_d == 1:
        return "Yesterday"
    if delta_h > 0:
        return "{:.0f}h {:.0f}m ago".format(delta_h, delta_m)
    if delta_m > 0:
        return "{:.0f}m {:.0f}s ago".format(delta_m, delta_s)
    if delta_s > 0:
        return "{:.0f}s ago".format(delta_s)
    return "{:.0f} ms ago".format(delta_ms)
if __name__ == "__main__":
    # Manual check: describe a moment 20 ms in the past, with debug output.
    now = datetime.datetime.now()
    slightly_earlier = datetime.datetime.now() - datetime.timedelta(milliseconds=20)
    print(humanize_date_difference(slightly_earlier, now, debug=True))

86
src/web.py Normal file
View file

@ -0,0 +1,86 @@
import json
import os
import re
import time
import hug
import requests
import db
import dir_utils
from mattermost import MMApi
def find_codimd_files_on_mattermost():
    """Scan the configured channels for CodiMD URLs and record them.

    Posts newer than the stored sync time are fetched from each channel;
    every URL containing ``codimd.zeus.gent`` is added to the database. The
    sync time is advanced only after all channels were processed.
    """
    mattermost = MMApi()
    channels = [
        "hrx6pgfswjbttcj8nim3jrwe7w",  # bestuur-INTERN
        "uda7ax9poprduq8ob56e1fqk4e",  # bestuur
    ]

    last_fetch_time = db.get_latest_sync_time()
    # Taken before fetching, in milliseconds.
    current_fetch_time = int(time.time() * 1000)
    print(f"Fetching posts since: {last_fetch_time}")

    # URLs end at whitespace, '#' or '?'.
    url_pattern = re.compile(r"(https?://[^\s#?]+)")

    for channel_id in channels:
        print(f"Fetching posts for channel_id: {channel_id}")

        posts = mattermost.get_posts_for_channel(channel_id, last_fetch_time)

        print(f"Scraping {len(posts)} posts")
        for post in posts.values():
            for url in url_pattern.findall(post["message"]):
                if "codimd.zeus.gent" not in url:
                    # A URL, but not one pointing at CodiMD.
                    continue
                print(url)
                db.add_discovered_file(url)

    # Everything succeeded: remember the moment this run started fetching.
    db.set_latest_sync_time(current_fetch_time)
def read_note(url, timeout=30):
    """Download a note and return the response body as text.

    :param url: URL of the note; ``/download`` is appended to it.
    :param timeout: seconds to wait for the server. Without a timeout,
        requests can block forever on an unresponsive host.
    :return: the response text, whatever the HTTP status code is.
    """
    return requests.get(f"{url}/download", timeout=timeout).text
def download_files():
    """Download every discovered note into ``data/note-<id>.md``.

    ``<id>`` is the last path segment of the note URL.
    """
    # The directory is not created anywhere else in this module.
    os.makedirs("data", exist_ok=True)
    for url in db.get_discovered_files():
        # Drop trailing slashes so the last path segment is never empty.
        note_url = url.rstrip("/")
        note_id = note_url[note_url.rfind("/") + 1:]
        print(f"Downloading url {url}")
        # Download before opening the target: a failed request must not
        # leave a truncated or empty note file behind.
        content = read_note(note_url)
        with open(f"data/note-{note_id}.md", "w") as f:
            f.write(content)
def validate_downloaded_files():
    """Register every file in ``data`` that carries sync metadata.

    :return: the full database content after the scan.
    """
    for entry in os.listdir("data"):
        found = dir_utils.find_metadata("data/" + entry)
        if found is None:
            continue
        db.add_valid_file(entry, found)
    return db._load_db()
@hug.get("/sync-mattermost")
def sync_mattermost():
    """Run the three sync stages in order and return the database content.

    Stages: find URLs on Mattermost, download the notes, validate them.
    """
    stages = (
        (
            "=======================================",
            "== Finding urls posted on mattermost ==",
            find_codimd_files_on_mattermost,
        ),
        (
            "=============================",
            "== Downloading found files ==",
            download_files,
        ),
        (
            "================================================",
            "== Finding valid files in the downloaded ones ==",
            validate_downloaded_files,
        ),
    )
    for rule, title, run_stage in stages:
        print()
        print(rule)
        print(title)
        run_stage()
    print()
    return db._load_db()

View file

@ -1,148 +0,0 @@
#!/usr/bin/env bash
# Sync the notes in ./data to the Gitlab "drive" repository via merge requests.
# Abort on errors, on unset variables and on failures inside pipelines.
set -euo pipefail
# Working copy of the Gitlab repository, relative to the start directory.
REPO_FOLDER="drive-temp"
# Prepare $REPO_FOLDER as a git work tree of the bestuur/drive repository.
# Reads GITLAB_ACCESS_TOKEN_NAME and GITLAB_ACCESS_TOKEN for the remote URL.
# Returns to the parent directory when done.
function clone_repo() {
mkdir -p "$REPO_FOLDER"
cd "$REPO_FOLDER"
# Empty when git does not report a work tree here (errors are swallowed).
# NOTE(review): presumably this is also non-empty when a parent directory is a
# git work tree, in which case the init branch below is skipped - confirm.
inside_git_repo="$(git rev-parse --is-inside-work-tree 2>/dev/null || true)"
if [ ! "$inside_git_repo" ]; then
# First run: initialise the repo, configure remote and identity, pull master.
git init
git remote add origin "https://$GITLAB_ACCESS_TOKEN_NAME:$GITLAB_ACCESS_TOKEN@git.zeus.gent/bestuur/drive.git"
git config user.email "codimd.zeus.gent@mcbloch.dev"
git config user.name "CodiMD sync bot"
git pull origin master
else
echo "> Git repo already initialized, skipping"
fi
git fetch -a
cd ..
}
# Discard uncommitted changes to tracked files in the current git work tree.
function clear_repo() {
git restore .
git checkout -- .
}
# Switch to branch $1, creating it from master when it does not exist yet.
# When the branch can track origin/$1, the latest remote state is pulled.
function checkout_branch() {
branch_name=$1
# Start from master
git checkout master
# Now go to the correct branch name
# `checkout -b` fails when the branch already exists; fall back to a plain
# checkout in that case.
if ! git checkout -b "$branch_name" >/dev/null; then
echo "> Checkout existing branch"
git checkout "$branch_name" >/dev/null
else
echo "> Created new branch"
fi
# Pull only when the upstream could be set.
if git branch --set-upstream-to="origin/$branch_name" "$branch_name"; then # >/dev/null
git pull
fi
}
# Commit note $1 to its sync branch and make sure a merge request exists.
# Must run inside the repository work tree.
# Reads the global $sync_to (target path inside the repo), set by sync_files.
function sync_file() {
note_name=$1
branch_name="codimd-sync_$sync_to"
echo "> Starting sync of $note_name"
clear_repo
checkout_branch "$branch_name"
echo "> Copy the note to $sync_to"
cp "../data/$note_name" "$sync_to"
git add "$sync_to"
# The commit fails when nothing changed; both outcomes are deliberately ignored.
if ! git commit -m "[bot] Update Gitlab with latest CodiMD file version"; then
#echo "> No changes in our file."
:
else
#echo "> Changes in our file, committing."
:
fi
git push -u origin "$branch_name"
#MR_NAME="[CodiMD Sync] $note_name"
#echo "> Checking if pr with name '$MR_NAME' already exists"
# mrs=$(curl --header "PRIVATE-TOKEN: $GITLAB_ACCESS_TOKEN" "https://git.zeus.gent/api/v4/projects/$GITLAB_PROJECT_ID/merge_requests?labels=codimd-sync" | jq -e 'select(type == "array" and length == 0)' )
# echo $mrs | jq -e 'select(type == "array" and length == 0)'
# Check if a MR is already made (open or merged)
echo "> Checking if the branch differs from master"
echo "> If so a new pr should be created to push our work"
echo "> If an open pr already exists, pass"
echo
# Number of diff lines between origin/master and the sync branch.
diff_lines=$(git diff "origin/master..$branch_name" | wc -l)
if [ "$diff_lines" == "0" ]; then
echo "> Branch has no changes compared to master."
else
echo "> Branch has changes"
# The state of merge requests is detected by grepping glab's message text.
if (glab mr list --all --source-branch="$branch_name" | grep "No merge requests match your search"); then
echo "> No matching Merge Request found at all"
glab mr create --label codimd-sync -t "[CodiMD sync] Add document $sync_to" --fill --yes
# NOTE(review): this leaves the work tree and deletes it, while the caller
# (sync_files) afterwards runs `cd ../data` relative to the new directory -
# confirm this is intended.
cd ..
rm -rf drive-temp
else
echo "> Matching Merge Request found"
echo "> Making sure it is an open one"
if (glab mr list --source-branch="$branch_name" | grep "No open merge requests match your search"); then
echo "No open merge request found"
glab mr create --label codimd-sync -t "[CodiMD sync] Update document $sync_to" --fill --yes
else
echo "Open merge request found."
fi
fi
fi
}
# Walk over data/*.md and sync every note that declares a sync-to target.
# Sets the global $sync_to, which sync_file reads.
function sync_files() {
cd data
for note_name in *.md; do
echo
echo "> ======================"
echo "> Analyzing $note_name"
# Extract the sync-to path
# Takes the lines from ":::spoiler Gitlab sync" up to the next ":::", keeps
# those containing "sync-to", takes the second ':'-separated field and lets
# xargs trim it. Yields an empty string when nothing is found.
sync_to=$(sed -n -e '/:::spoiler Gitlab sync/,/:::/ p' "$note_name" | grep "sync-to" | cut -d":" -f2 | xargs || true)
if [ "$sync_to" == "" ]; then
# echo "> No metadata found, skip"
:
else
echo "> Found a requested sync to: $sync_to"
# sync_file expects to run inside the repository work tree.
cd ../$REPO_FOLDER
sync_file "$note_name"
cd ../data
fi
done
}
# Entry point: authenticate glab, prepare the repository, sync all notes.
# NOTE(review): this reads $GITLAB_TOKEN while clone_repo reads
# $GITLAB_ACCESS_TOKEN - confirm both variables are meant to exist.
glab auth login --hostname git.zeus.gent --token "$GITLAB_TOKEN"
# A one time operation to clone the repo.
clone_repo
# Loop over the files in the data folder and sync them to Gitlab via Merge Requests
sync_files
exit 0
# https://git.zeus.gent/bestuur/drive
# GITLAB_PROJECT_ID=2