# codimd-git-sync/src/web.py

import json
import os
import re
import time
import traceback
from pprint import pprint

import hug
import requests

import db
import dir_utils
import mattermost_client
import mattermost_communication
import sync_gitea as sync
from config import config
from utils import id_to_url, url_to_id


def find_codimd_files_on_mattermost():
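    """Scan the configured Mattermost channels for links to CodiMD notes.

    Every URL pointing to codimd.zeus.gent is stripped of its query/fragment
    part and recorded in the database together with the post it was found in.
    On success, the current fetch time is stored so the next run only scans
    newer posts.
    """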
    mattermost = mattermost_client.MMApi()
    channels = config["mattermost"]["scrape_channel_ids"]
    last_fetch_time = db.get_latest_sync_time()
    current_fetch_time = int(time.time() * 1000)
    print(f"Fetching posts since: {last_fetch_time}")

    for channel_id in channels:
        print(f"Fetching posts for channel_id: {channel_id}")
        # TODO Use first statement for all posts
        posts = mattermost.get_posts_for_channel(channel_id, last_fetch_time)
        # posts = mattermost.posts.get_posts_for_channel(channel_id)

        print(f"Scraping {len(posts)} posts")
        for post_id, post in posts.items():
            # old: r"(https?://[^\s#?]+)"
            url_regex = r"https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)"
            urls = re.findall(url_regex, post["message"])
            for url in urls:
                idx = url.find("codimd.zeus.gent")
                if idx == -1:
                    # It is a URL, but not one pointing to CodiMD
                    continue

                # Remove everything after the "#" or "?"
                for char in ["#", "?"]:
                    cut_idx = url.rfind(char) if url.rfind(char) != -1 else len(url)
                    url = url[:cut_idx]

                # pprint(post)
                print(url)
                db.add_discovered_url(url, post)

    # Everything succeeded: save the current Unix time (ms) as the latest fetch moment
    db.set_latest_sync_time(current_fetch_time)


def read_note(url):
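    """Return the raw Markdown contents of a CodiMD note via its /download endpoint."""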
    return requests.get(f"{url}/download").text


def download_files():
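    """Download every discovered note to data/note-<id>.md and record the local path."""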
    for file_id, file_info in db.get_files().items():
        local_file_path = f"data/note-{file_id}.md"
        url = file_info["source_url"]
        with open(local_file_path, "w") as f:
            print(f"Downloading url {url}")
            f.write(read_note(url))
        db.set_local_file_path(file_id, local_file_path)


def validate_downloaded_files(post_mattermost_hint=True):
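    """Check each downloaded file for sync metadata and mark it valid or invalid.

    Newly found valid files, and invalid ones when post_mattermost_hint is set,
    are reported back to Mattermost.
    """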
    path = "data"
    dir_list = os.listdir(path)

    for file_id, file_info in db.get_files().items():
        file_path = file_info["local_file_path"]
        metadata = dir_utils.find_metadata(file_path)
        if metadata is not None:
            is_new_file, new_file_info = db.mark_file_valid(file_id, metadata)
            if is_new_file:
                mattermost_communication.report_newly_found_file(file_id, new_file_info)
        else:
            changed, new_file_info = db.mark_file_invalid(file_id)
            if changed and post_mattermost_hint:
                mattermost_communication.report_newly_found_but_invalid_file(
                    file_id, new_file_info
                )

    return db._load_db()


def sync_files_to_gitea():
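    """Push every valid downloaded note to its configured "sync-to" path in Gitea."""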
    repo, api_handler = sync.init_sync()
    for file_id, file_info in db.get_files().items():
        if file_info["valid"]:
            local_file_path = file_info["local_file_path"]
            sync_to = file_info["metadata"]["sync-to"]
            try:
                sync.sync_file(repo, api_handler, local_file_path, sync_to)
            except Exception:
                print("Critical error: Failed to sync file to Gitea")
                traceback.print_exc()


@hug.get("/sync-mattermost")
def sync_mattermost():
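    """HTTP endpoint that runs the full pipeline: scrape Mattermost, download,
    validate and sync the discovered CodiMD notes to Gitea."""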
    print()
    print("=======================================")
    print("== Finding urls posted on mattermost ==")
    find_codimd_files_on_mattermost()

    print()
    print("=============================")
    print("== Downloading found files ==")
    download_files()

    print()
    print("================================================")
    print("== Finding valid files in the downloaded ones ==")
    validate_downloaded_files(post_mattermost_hint=True)

    print()
    print("==========================")
    print("== Syncing files to git ==")
    sync_files_to_gitea()

    print()
    # return db._load_db()