From 3c13ec2eae502274ecb2c7f09d9ee372f954f87d Mon Sep 17 00:00:00 2001 From: mcbloch Date: Thu, 6 Oct 2022 22:44:04 +0200 Subject: [PATCH] Add new python code with mattermost scraping stuff --- .gitignore | 8 +- .tool-versions | 1 + Makefile | 2 + fetch_notes.sh | 22 ----- requirements.txt | 4 + src/MattermostObjects.py | 131 ++++++++++++++++++++++++++++ src/db.py | 63 ++++++++++++++ src/dir_utils.py | 35 ++++++++ src/mattermost.py | 181 +++++++++++++++++++++++++++++++++++++++ src/utils.py | 75 ++++++++++++++++ src/web.py | 86 +++++++++++++++++++ sync_notes.sh | 148 -------------------------------- 12 files changed, 585 insertions(+), 171 deletions(-) create mode 100644 .tool-versions create mode 100644 Makefile delete mode 100755 fetch_notes.sh create mode 100644 requirements.txt create mode 100644 src/MattermostObjects.py create mode 100644 src/db.py create mode 100644 src/dir_utils.py create mode 100644 src/mattermost.py create mode 100644 src/utils.py create mode 100644 src/web.py delete mode 100755 sync_notes.sh diff --git a/.gitignore b/.gitignore index c195d28..28a1000 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,10 @@ data/ drive-temp/ -cookiefile +db.json .env + +venv/ +__pycache__/ +.idea +users.toml + diff --git a/.tool-versions b/.tool-versions new file mode 100644 index 0000000..9dc2289 --- /dev/null +++ b/.tool-versions @@ -0,0 +1 @@ +python 3.10.4 \ No newline at end of file diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..2c8a8db --- /dev/null +++ b/Makefile @@ -0,0 +1,2 @@ +web: + python -m hug -f src/web.py \ No newline at end of file diff --git a/fetch_notes.sh b/fetch_notes.sh deleted file mode 100755 index 54f623d..0000000 --- a/fetch_notes.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/bash - -mkdir -p data - -echo "Login to CodiMD" -curl -c cookiefile "$CMD_SERVER_URL/login" -X POST -H "Referer: $CMD_SERVER_URL/" --data-raw "email=$CMD_EMAIL&password=$CMD_PASSWORD" >/dev/null - -echo -echo - -curl -b cookiefile 
'https://codimd.zeus.gent/me' | jq -echo -notes_history=$(curl -b cookiefile 'https://codimd.zeus.gent/history') - -# echo $notes_history | jq -# note_id=$(echo "$notes_history" | jq -r '.history[1].id') -ids=$(echo "$notes_history" | jq -r '.history | map(.id) | .[]') - -while IFS= read -r line; do - echo "... Reading note with ID: $line ..." - curl -b cookiefile "https://codimd.zeus.gent/$line/download" >"data/note-$line.md" -done <<<"$ids" diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..b8573c9 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,4 @@ +tabulate +colored +mattermostdriver +hug \ No newline at end of file diff --git a/src/MattermostObjects.py b/src/MattermostObjects.py new file mode 100644 index 0000000..f1d7c25 --- /dev/null +++ b/src/MattermostObjects.py @@ -0,0 +1,131 @@ +from typing import NamedTuple, List, Dict + + +class MMUser(NamedTuple): + id: str + create_at: int + update_at: int + delete_at: int + username: str + first_name: str + last_name: str + nickname: str + email: str + auth_data: str + auth_service: str + roles: str + locale: str + timezone: dict + position: any + + is_bot: bool = None + bot_description: str = None + email_verified: bool = None + notify_props: dict = None + last_password_update: int = None + failed_attempts: int = None + mfa_active: bool = False + terms_of_service_id: str = None + terms_of_service_create_at: int = None + props: dict = {} + last_picture_update: int = None + + @staticmethod + def load(data): + try: + return MMUser(**data) + except TypeError as e: + print("[ERROR] Could not load dict into MMUser namedtuple") + print(str(e)) + + +class MMPostProps(NamedTuple): + from_webhook: str = False + override_icon_url: str = None + override_username: str = None + webhook_display_name: str = None + + channel_mentions: Dict = None + matterircd_krcggydky38kdcuubsc7fddc7w: str = None + matterircd_s4ptwhx7wfnx7qwexp1khorh7e: str = None + username: str = None + userId: str = None + 
old_header: str = None + new_header: str = None + old_purpose: str = None + new_purpose: str = None + old_displayname: str = None + new_displayname: str = None + remove_link_preview: str = None + removedUserId: str = None + addedUserId: str = None + removedUsername: str = None + addedUsername: str = None + message: str = None + attachments: str = None + from_bot: str = False + + +class MMPost(NamedTuple): + channel_id: str + create_at: int + delete_at: int + edit_at: int + hashtags: str + id: str + is_pinned: bool + message: str + metadata: Dict + original_id: str + parent_id: str + pending_post_id: str + root_id: str + type: str + update_at: int + user_id: str + message_source: str = None + has_reactions: bool = None + file_ids: List[str] = None + props: MMPostProps = None + + def from_human(self): + return self.props is None or ( + self.props.from_webhook is False and self.props.from_bot is False + ) + + @staticmethod + def load(data): + try: + props = None + if "props" in data: + try: + props: MMPostProps = MMPostProps(**data["props"]) + except TypeError as e: + print("[ERROR] Could not load dict into MMPostProps namedtuple") + print(str(e)) + del data["props"] + return MMPost(props=props, **data) + except TypeError as e: + print("[ERROR] Could not load dict into MMPost namedtuple") + print(str(e)) + + +class MMChannelPosts(NamedTuple): + prev_post_id: str + next_post_id: str + order: List[str] + posts: Dict[str, MMPost] + disable_group_highlight: any + reply_count: any + + @staticmethod + def load(data): + try: + posts: Dict[str, MMPost] = { + k: MMPost.load(v) for (k, v) in data["posts"].items() + } + del data["posts"] + return MMChannelPosts(posts=posts, **data) + except TypeError as e: + print("[ERROR] Could not load dict into MMUser namedtuple") + print(str(e)) diff --git a/src/db.py b/src/db.py new file mode 100644 index 0000000..4434e37 --- /dev/null +++ b/src/db.py @@ -0,0 +1,63 @@ +import json +from os.path import exists +from typing import List + 
+db_filename = "db.json" + + +def init_db(): + file_exists = exists(db_filename) + if not file_exists: + print("Initializing json file database") + with open(db_filename, "w") as db_file: + db_file.write("{}") + + +init_db() + + +def _load_db(): + with open(db_filename, "r") as db_file: + db = json.loads(db_file.read()) + return db + + +def _save_db(db): + with open(db_filename, "w") as db_file: + db_file.write(json.dumps(db)) + + +def get_latest_sync_time() -> int: + db = _load_db() + return db.get("latest_sync_time", 0) + + +def set_latest_sync_time(le_date) -> None: + db = _load_db() + db["latest_sync_time"] = le_date + _save_db(db) + + +def add_discovered_file(file_url) -> List[str]: + db = _load_db() + discovered_files = set(db.get("discovered_files", [])) + discovered_files.add(file_url) + discovered_files = list(discovered_files) + db["discovered_files"] = discovered_files + _save_db(db) + return discovered_files + + +def get_discovered_files() -> List[str]: + db = _load_db() + discovered_files = set(db.get("discovered_files", [])) + return discovered_files + + +def add_valid_file(filename, metadata): + db = _load_db() + valid_files = db.get("valid_files", {}) + valid_files[filename] = metadata + db["valid_files"] = valid_files + _save_db(db) + return valid_files diff --git a/src/dir_utils.py b/src/dir_utils.py new file mode 100644 index 0000000..ae1b4eb --- /dev/null +++ b/src/dir_utils.py @@ -0,0 +1,35 @@ +import os +import re + +# pattern = re.compile(":::spoiler Gitlab sync([^:]*):::") +pattern = re.compile("[^:]*:::") + + +def find_metadata(filename): + with open(filename, "r") as file: + print(f"File: {filename}") + data = file.read() + + metadata = {} + + start_str = ":::spoiler Gitlab sync\n" + end_str = "\n:::" + + start_i = data.find(start_str) + if start_i >= 0: + start_i += len(start_str) + end_i = data.find(end_str, start_i + 1) + file_data = data[start_i:end_i] + for line in file_data.split("\n"): + key_index = 2 + value_index = line.find(": ") 
+ key = line[key_index:value_index] + value = line[value_index + 2 :] + metadata[key] = value + print("Valid report") + print(metadata) + else: + print("Not a valid report") + return None + + return metadata diff --git a/src/mattermost.py b/src/mattermost.py new file mode 100644 index 0000000..5e7b3e6 --- /dev/null +++ b/src/mattermost.py @@ -0,0 +1,181 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +import pprint as pp +from abc import ABC, abstractmethod +from datetime import datetime +from enum import Enum +from time import sleep +from typing import Dict, List + +import toml +from colored import style +from mattermostdriver import Driver +from tabulate import tabulate + +from MattermostObjects import MMUser, MMPost, MMChannelPosts +from utils import humanize_date_difference, timer + +pp = pp.PrettyPrinter(indent=2) + + +class LogLevel(Enum): + INFO = "INFO" + ERROR = "ERROR" + + +class User(ABC): + @abstractmethod + def credentials_dict(self) -> dict: + pass + + +class NormalUser(User): + def __init__(self, login_id, password): + self.login_id = login_id + self.password = password + + def credentials_dict(self) -> dict: + return {"login_id": self.login_id, "password": self.password} + + def __repr__(self): + return "User".format(self.login_id) + + +class TokenUser(User): + def __init__(self, token): + self.token = token + + def credentials_dict(self) -> dict: + return {"token": self.token} + + def __repr__(self): + return "TokenUser".format(self.token) + + +users: {str: [User]} = {} + + +def loadusers(): + with open("users.toml") as f: + usersstring = f.read() + usersdict = toml.loads(usersstring) + + usr = None + for name, data in usersdict.items(): + if "token" in data: + usr = TokenUser(token=data["token"]) + elif "name" in data and "password" in data: + usr = NormalUser(login_id=data["name"], password=data["password"]) + else: + print("Invalid user '{}' in toml file".format(name)) + exit(1) + users[name] = usr + + +loadusers() + + +def merge_dict(a: 
dict, b: dict) -> dict: + return {**a, **b} + + +class MMApi(Driver): + def __init__(self, user: User = users["flynn"]): + Driver.__init__( + self, + merge_dict( + { + "url": "mattermost.zeus.gent", + "port": 443, + "debug": False, + }, + user.credentials_dict(), + ), + ) + self.login() + self.user_id = self.users.get_user(user_id="me")["id"] + self.team_id = self.teams.get_team_by_name("zeus")["id"] + print(f" = Creating mattermost client") + print(f" = - User: {self.user_id}") + print(f" = - Team: {self.team_id}") + + @staticmethod + def print_response(resp, title="Response"): + print("--------") + print(style.BOLD + title + style.RESET) + pp.pprint(resp) + + def log(self, text: str, log_level: LogLevel = LogLevel.INFO): + print(f"{style.BOLD}[{log_level.value}]{style.RESET} {text}") + + def get_channel_id(self, channel_name): + resp = self.channels.get_channel_by_name(self.team_id, channel_name, since) + id = resp["id"] + self.log(f"Fetching channel id for {channel_name}: {id}") + return id + + @timer + def get_posts_for_channel(self, channel_id, since): + print(f"Fetching posts for {channel_id} since {since}") + page_size = 200 + page_i = 0 + data = {} + more = True + while more: + resp = self.posts.get_posts_for_channel( + channel_id, + params={"page": page_i, "per_page": page_size, "since": since}, + ) + page_i += 1 + print(f"Fetching page {page_i}") + # print("-", end=" ") + + paged_data = resp["posts"] + paged_count = len(paged_data) + + if since != 0: + # The Mattermost API behaves surprisingly here. + # If you add the since parameter and it's different from 0 it will give you 1000 posts max. + # It will not respect your page_index or page_size. 
+ more = False + else: + if paged_count < page_size: + more = False + + # Transform the data into something more sensible or practical + if type(paged_data) is list: + paged_data = {item["id"]: item for item in paged_data} + + # Append the paged_data to our global data variable + data = {**data, **paged_data} + print() + + self.log(f"Post count: {len(data)}") + return data + + +class ChannelApi(MMApi): + def __init__(self, channel_name, user=None): + MMApi.__init__(self, user) + self.channel_id = self.get_channel_id(channel_name) + + def create_post(self, message: str, props: Dict = None) -> None: + resp = self.posts.create_post( + options={"channel_id": self.channel_id, "message": message, "props": props} + ) + self.log(f'Message successfully created: "{message}"') + # print_response("Create post", resp) + + +if __name__ == "__main__": + foo = MMApi(user=users["flynn"]) + + # all_posts = foo.get_all_posts() + + channel = foo.channels.get_channel_by_name( + foo.team_id, + "bestuur", + ) + channel_id = channel["id"] + resp = foo.posts.get_posts_for_channel(channel_id, params={"per_page": 200}) + channel_posts: MMChannelPosts = MMChannelPosts.load(resp) diff --git a/src/utils.py b/src/utils.py new file mode 100644 index 0000000..10f7cb1 --- /dev/null +++ b/src/utils.py @@ -0,0 +1,75 @@ +import datetime +import functools +import time + + +def timer(func): + """Print the runtime of the decorated function""" + + @functools.wraps(func) + def wrapper_timer(*args, **kwargs): + start_time = time.perf_counter() # 1 + value = func(*args, **kwargs) + end_time = time.perf_counter() # 2 + run_time = end_time - start_time # 3 + print(f"Finished {func.__name__!r} in {run_time:.4f} secs") + return value + + return wrapper_timer + + +def humanize_date_difference( + older: datetime, newer: datetime = None, offset: int = None, debug=False +): + if newer: + dt = newer - older + milliseconds = dt.microseconds / 1e3 + offset = milliseconds + (dt.seconds * 1000) + (dt.days * 1000 * 60 * 
60 * 24) + if offset: + if debug: + print(f"{offset} s offset") + + delta_ms = int(offset % 1000) + offset /= 1e3 + delta_s = int(offset % 60) + offset /= 60 + delta_m = int(offset % 60) + offset /= 60 + delta_h = int(offset % 24) + offset /= 24 + delta_d = int(offset) + + if debug: + print("{:d} ms".format(delta_ms)) + print("{:d} s".format(delta_s)) + print("{:d} m".format(delta_m)) + print("{:d} h".format(delta_h)) + print("{:d} d".format(delta_d)) + else: + raise ValueError("Must supply otherdate or offset (from now)") + + if delta_d > 1: + if delta_d > 6: + date = older + datetime.timedelta( + days=-delta_d, hours=-delta_h, minutes=-delta_m + ) + return date.strftime("%A, %Y %B %m, %H:%I") + else: + wday = older + datetime.timedelta(days=-delta_d) + return wday.strftime("%A") + if delta_d == 1: + return "Yesterday" + if delta_h > 0: + return "{:.0f}h {:.0f}m ago".format(delta_h, delta_m) + if delta_m > 0: + return "{:.0f}m {:.0f}s ago".format(delta_m, delta_s) + if delta_s > 0: + return "{:.0f}s ago".format(delta_s) + else: + return "{:.0f} ms ago".format(delta_ms) + + +if __name__ == "__main__": + date1 = datetime.datetime.now() + date2 = datetime.datetime.now() - datetime.timedelta(milliseconds=20) + print(humanize_date_difference(date2, date1, debug=True)) diff --git a/src/web.py b/src/web.py new file mode 100644 index 0000000..c723d5c --- /dev/null +++ b/src/web.py @@ -0,0 +1,86 @@ +import json +import os +import re +import time + +import hug +import requests + +import db +import dir_utils +from mattermost import MMApi + + +def find_codimd_files_on_mattermost(): + mattermost = MMApi() + channels = [ + "hrx6pgfswjbttcj8nim3jrwe7w", # bestuur-INTERN + "uda7ax9poprduq8ob56e1fqk4e", # bestuur + ] + + last_fetch_time = db.get_latest_sync_time() + current_fetch_time = int(time.time() * 1000) + print(f"Fetching posts since: {last_fetch_time}") + for channel_id in channels: + print(f"Fetching posts for channel_id: {channel_id}") + + # TODO Use first statement for 
all posts + posts = mattermost.get_posts_for_channel(channel_id, last_fetch_time) + # posts = mattermost.posts.get_posts_for_channel(channel_id) + + print(f"Scraping {len(posts)} posts") + for post_id, post in posts.items(): + urls = re.findall(r"(https?://[^\s#?]+)", post["message"]) + for url in urls: + idx = url.find("codimd.zeus.gent") + if idx == -1: + # In this case it's an url but not for codimd + continue + + print(url) + db.add_discovered_file(url) + + # When everything succeeded. Save the current unix time as latest fetched moment + db.set_latest_sync_time(current_fetch_time) + + +def read_note(url): + return requests.get(f"{url}/download").text + + +def download_files(): + for url in db.get_discovered_files(): + with open(f'data/note-{url[url.rfind("/") + 1:]}.md', "w") as f: + print(f"Downloading url {url}") + f.write(read_note(url)) + + +def validate_downloaded_files(): + path = "data" + dir_list = os.listdir(path) + + for filename in dir_list: + metadata = dir_utils.find_metadata("data/" + filename) + if metadata is not None: + db.add_valid_file(filename, metadata) + + return db._load_db() + + +@hug.get("/sync-mattermost") +def sync_mattermost(): + print() + print("=======================================") + print("== Finding urls posted on mattermost ==") + find_codimd_files_on_mattermost() + print() + print("=============================") + print("== Downloading found files ==") + download_files() + print() + print("================================================") + print("== Finding valid files in the downloaded ones ==") + validate_downloaded_files() + print() + + return db._load_db() diff --git a/sync_notes.sh b/sync_notes.sh deleted file mode 100755 index 76032e1..0000000 --- a/sync_notes.sh +++ /dev/null @@ -1,148 +0,0 @@ -#!/usr/bin/env bash - -set -euo pipefail - -REPO_FOLDER="drive-temp" - -function clone_repo() { - mkdir -p "$REPO_FOLDER" - cd "$REPO_FOLDER" - - inside_git_repo="$(git rev-parse --is-inside-work-tree 2>/dev/null || true)" 
- if [ ! "$inside_git_repo" ]; then - git init - git remote add origin "https://$GITLAB_ACCESS_TOKEN_NAME:$GITLAB_ACCESS_TOKEN@git.zeus.gent/bestuur/drive.git" - git config user.email "codimd.zeus.gent@mcbloch.dev" - git config user.name "CodiMD sync bot" - git pull origin master - else - echo "> Git repo already initialized, skipping" - fi - git fetch -a - - cd .. -} - -function clear_repo() { - git restore . - git checkout -- . -} - -function checkout_branch() { - branch_name=$1 - - # Start from master - git checkout master - - # Now go to the correct branch name - if ! git checkout -b "$branch_name" >/dev/null; then - echo "> Checkout existing branch" - git checkout "$branch_name" >/dev/null - else - echo "> Created new branch" - fi - - if git branch --set-upstream-to="origin/$branch_name" "$branch_name"; then # >/dev/null - git pull - fi -} - -function sync_file() { - note_name=$1 - branch_name="codimd-sync_$sync_to" - - echo "> Starting sync of $note_name" - - clear_repo - checkout_branch "$branch_name" - - echo "> Copy the note to $sync_to" - cp "../data/$note_name" "$sync_to" - - git add "$sync_to" - if ! git commit -m "[bot] Update Gitlab with latest CodiMD file version"; then - #echo "> No changes in our file." - : - else - #echo "> Changes in our file, committing." 
- : - fi - - git push -u origin "$branch_name" - - #MR_NAME="[CodiMD Sync] $note_name" - #echo "> Checking if pr with name '$MR_NAME' already exists" - - # mrs=$(curl --header "PRIVATE-TOKEN: $GITLAB_ACCESS_TOKEN" "https://git.zeus.gent/api/v4/projects/$GITLAB_PROJECT_ID/merge_requests?labels=codimd-sync" | jq -e 'select(type == "array" and length == 0)' ) - - # echo $mrs | jq -e 'select(type == "array" and length == 0)' - - # Check if a MR is already made (open or merged) - echo "> Checking if the branch differs from master" - echo "> If so a new pr should be created to push our work" - echo "> If an open pr already exists, pass" - echo - - diff_lines=$(git diff "origin/master..$branch_name" | wc -l) - if [ "$diff_lines" == "0" ]; then - echo "> Branch has no changes compared to master." - else - echo "> Branch has changes" - - if (glab mr list --all --source-branch="$branch_name" | grep "No merge requests match your search"); then - echo "> No matching Merge Request found at all" - - glab mr create --label codimd-sync -t "[CodiMD sync] Add document $sync_to" --fill --yes - - cd .. - rm -rf drive-temp - else - echo "> Matching Merge Request found" - echo "> Making sure it is an open one" - - if (glab mr list --source-branch="$branch_name" | grep "No open merge requests match your search"); then - echo "No open merge request found" - glab mr create --label codimd-sync -t "[CodiMD sync] Update document $sync_to" --fill --yes - else - echo "Open merge request found." 
- fi - fi - - fi -} - -function sync_files() { - cd data - for note_name in *.md; do - echo - echo "> ======================" - echo "> Analyzing $note_name" - - # Extract the sync-to path - sync_to=$(sed -n -e '/:::spoiler Gitlab sync/,/:::/ p' "$note_name" | grep "sync-to" | cut -d":" -f2 | xargs || true) - - if [ "$sync_to" == "" ]; then - # echo "> No metadata found, skip" - : - else - echo "> Found a requested sync to: $sync_to" - cd ../$REPO_FOLDER - sync_file "$note_name" - cd ../data - fi - - done -} - -glab auth login --hostname git.zeus.gent --token "$GITLAB_TOKEN" - -# A one time operation to clone the repo. -clone_repo - -# Loop over the files in the data folder and sync them to Gitlab via Merge Requests -sync_files - -exit 0 - -# https://git.zeus.gent/bestuur/drive -# GITLAB_PROJECT_ID=2