import json
import os
import re
import time
from pprint import pprint

import hug
import requests

import db
import dir_utils
import mattermost_client
import mattermost_communication
import sync
import gitlab
from utils import id_to_url, url_to_id


def find_codimd_files_on_mattermost():
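    """Scan the configured Mattermost channels for links to codimd.zeus.gent.

    Only posts newer than the last recorded sync time are fetched. Every
    matching URL is stripped of its query/fragment part and recorded via
    db.add_discovered_url(); once all channels succeed, the fetch timestamp
    is persisted so the next run picks up where this one left off.
    """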
    mattermost = mattermost_client.MMApi()
    channels = [
        "hrx6pgfswjbttcj8nim3jrwe7w",  # bestuur-INTERN
        "uda7ax9poprduq8ob56e1fqk4e",  # bestuur
    ]

    last_fetch_time = db.get_latest_sync_time()
    current_fetch_time = int(time.time() * 1000)
    print(f"Fetching posts since: {last_fetch_time}")
    for channel_id in channels:
        print(f"Fetching posts for channel_id: {channel_id}")

        # TODO Use first statement for all posts
        posts = mattermost.get_posts_for_channel(channel_id, last_fetch_time)
        # posts = mattermost.posts.get_posts_for_channel(channel_id)

        print(f"Scraping {len(posts)} posts")
        for post_id, post in posts.items():
            # old: r"(https?://[^\s#?]+)"
            url_regex = r"https?:\/\/(?:www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b(?:[-a-zA-Z0-9()@:%_\+.~#?&\/=]*)"
            urls = re.findall(url_regex, post["message"])
            for url in urls:
                idx = url.find("codimd.zeus.gent")
                if idx == -1:
                    # It's a URL, but not a CodiMD one
                    continue

                # Strip everything from the first "#" or "?" (fragment/query)
                for char in ["#", "?"]:
                    cut_idx = url.find(char)
                    if cut_idx != -1:
                        url = url[:cut_idx]

                # pprint(post)
                print(url)
                db.add_discovered_url(url, post)

    # When everything succeeded, save the current Unix time (in ms) as the
    # latest fetched moment
    db.set_latest_sync_time(current_fetch_time)


def read_note(url):
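    """Fetch the raw markdown of a CodiMD note via its /download endpoint."""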
    return requests.get(f"{url}/download").text


def download_files():
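    """Download every discovered note to data/note-<file_id>.md and record its local path."""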
    for file_id, file_info in db.get_files().items():
        local_file_path = f"data/note-{file_id}.md"
        url = file_info["source_url"]
        with open(local_file_path, "w") as f:
            print(f"Downloading url {url}")
            f.write(read_note(url))
        db.set_local_file_path(file_id, local_file_path)


def validate_downloaded_files():
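    """Check each downloaded file for sync metadata and report the outcome.

    Files with metadata are marked valid, the rest invalid; state changes
    are reported to Mattermost via mattermost_communication.
    """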
    path = "data"
    dir_list = os.listdir(path)

    for file_id, file_info in db.get_files().items():
        file_path = file_info["local_file_path"]
        metadata = dir_utils.find_metadata(file_path)
        if metadata is not None:
            is_new_file, new_file_info = db.mark_file_valid(file_id, metadata)
            if is_new_file:
                mattermost_communication.report_newly_found_file(file_id, new_file_info)
        else:
            changed, new_file_info = db.mark_file_invalid(file_id)
            if changed:
                mattermost_communication.report_newly_found_but_invalid_file(
                    file_id, new_file_info
                )

    return db._load_db()


def sync_files_to_gitlab():
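    """Push every valid file to its configured "sync-to" target via sync.sync_file()."""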
    repo, drive = sync.init_sync()
    for file_id, file_info in db.get_files().items():
        if file_info["valid"]:
            local_file_path = file_info["local_file_path"]
            sync_to = file_info["metadata"]["sync-to"]
            sync.sync_file(drive, repo, local_file_path, sync_to)


@hug.get("/sync-mattermost")
def sync_mattermost():
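    """HTTP entry point (GET /sync-mattermost): run the full pipeline.

    Discovers CodiMD links on Mattermost, downloads and validates the notes,
    syncs the valid ones to GitLab, and returns the internal db state.
    """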
    print()
    print("=======================================")
    print("== Finding urls posted on mattermost ==")
    find_codimd_files_on_mattermost()
    print()
    print("=============================")
    print("== Downloading found files ==")
    download_files()
    print()
    print("================================================")
    print("== Finding valid files in the downloaded ones ==")
    validate_downloaded_files()
    print()
    print("=============================")
    print("== Syncing files to gitlab ==")
    sync_files_to_gitlab()
    print()
    return db._load_db()
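

# Hedged usage note: hug exposes the decorated endpoint through its built-in
# development server. Assuming this module is saved as app.py (filename is an
# assumption, not confirmed by the source), it can be run locally with:
#
#   hug -f app.py
#
# and a sync triggered with: curl http://localhost:8000/sync-mattermost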