import re
import time
import traceback
from pprint import pprint

import hug
import requests

import db
import dir_utils
import mattermost_client
import mattermost_communication
import sync_gitea as sync
from config import config


def find_codimd_files_on_mattermost():
    """Scan the configured Mattermost channels for CodiMD links posted since the last sync."""
    mattermost = mattermost_client.MMApi()
    channels = config["mattermost"]["scrape_channel_ids"]
    last_fetch_time = db.get_latest_sync_time()
    current_fetch_time = int(time.time() * 1000)
    print(f"Fetching posts since: {last_fetch_time}")

    # Matches generic http(s) URLs. Previous, simpler pattern: r"(https?://[^\s#?]+)"
    url_regex = r"https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)"

    for channel_id in channels:
        print(f"Fetching posts for channel_id: {channel_id}")
        # TODO: to re-scrape all posts instead of only new ones, use the commented variant below
        posts = mattermost.get_posts_for_channel(channel_id, last_fetch_time)
        # posts = mattermost.posts.get_posts_for_channel(channel_id)
        print(f"Scraping {len(posts)} posts")

        for post_id, post in posts.items():
            urls = re.findall(url_regex, post["message"])
            for url in urls:
                if "codimd.zeus.gent" not in url:
                    # It's a URL, but not a CodiMD one; skip it.
                    continue

                # Strip the fragment (#) and query string (?) from the URL.
                for char in ("#", "?"):
                    cut_idx = url.rfind(char)
                    if cut_idx != -1:
                        url = url[:cut_idx]

                # pprint(post)
                print(url)
                db.add_discovered_url(url, post)

    # Everything succeeded: store the current Unix time (in milliseconds)
    # as the moment of the latest successful fetch.
    db.set_latest_sync_time(current_fetch_time)


def read_note(url):
    """Download the raw markdown contents of a CodiMD note."""
    return requests.get(f"{url}/download").text


def download_files():
    """Download every discovered note to a local markdown file and record its path."""
    for file_id, file_info in db.get_files().items():
        local_file_path = f"data/note-{file_id}.md"
        url = file_info["source_url"]
        with open(local_file_path, "w") as f:
            print(f"Downloading url {url}")
            f.write(read_note(url))
        db.set_local_file_path(file_id, local_file_path)


def validate_downloaded_files(post_mattermost_hint=True):
    """Mark downloaded files valid or invalid based on their sync metadata, reporting on Mattermost."""
    for file_id, file_info in db.get_files().items():
        file_path = file_info["local_file_path"]
        metadata = dir_utils.find_metadata(file_path)
        if metadata is not None:
            is_new_file, new_file_info = db.mark_file_valid(file_id, metadata)
            if is_new_file:
                mattermost_communication.report_newly_found_file(file_id, new_file_info)
        else:
            changed, new_file_info = db.mark_file_invalid(file_id)
            if changed and post_mattermost_hint:
                mattermost_communication.report_newly_found_but_invalid_file(
                    file_id, new_file_info
                )
    return db._load_db()


def sync_files_to_gitea():
    """Push every valid file to its configured 'sync-to' location in the Gitea repository."""
    repo, api_handler = sync.init_sync()
    for file_id, file_info in db.get_files().items():
        if file_info["valid"]:
            local_file_path = file_info["local_file_path"]
            sync_to = file_info["metadata"]["sync-to"]
            try:
                sync.sync_file(repo, api_handler, local_file_path, sync_to)
            except Exception:
                print("Critical error: failed to sync file to Gitea")
                traceback.print_exc()


@hug.get("/sync-mattermost")
def sync_mattermost():
    """Run the full pipeline: discover URLs, download notes, validate them, and sync to git."""
    print()
    print("=======================================")
    print("== Finding urls posted on mattermost ==")
    find_codimd_files_on_mattermost()

    print()
    print("=============================")
    print("== Downloading found files ==")
    download_files()

    print()
    print("================================================")
    print("== Finding valid files in the downloaded ones ==")
    validate_downloaded_files(post_mattermost_hint=True)

    print()
    print("==========================")
    print("== Syncing files to git ==")
    sync_files_to_gitea()
    print()
    # return db._load_db()
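

# Usage sketch, not part of the service itself: the @hug.get route above is
# served by hug's CLI. This assumes the module is saved as app.py and that the
# hug CLI is installed on PATH; the port flag and file name are illustrative.
#
#   hug -f app.py -p 8000
#   curl http://localhost:8000/sync-mattermost
#
# hug exposes every decorated route in the module over HTTP, so hitting the
# endpoint triggers one full discover/download/validate/sync cycle.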