From 1b18dae31157856aee1022521c3bb7d463baa31e Mon Sep 17 00:00:00 2001
From: Thomas Peetz
Date: Tue, 1 Apr 2025 08:12:53 +0200
Subject: [PATCH] add scripts from repository python-scripts

---
 scripts/check_kontor.py    | 142 +++++++++++++++++++++++++++++++++++++
 scripts/copy_to_mariadb.py |  60 ++++++++++++++++
 scripts/copy_to_sqlite.py  |  55 ++++++++++++++
 scripts/db_structure.py    |  61 ++++++++++++++++
 scripts/download.py        |  95 +++++++++++++++++++++++++
 scripts/read_list.py       |  69 ++++++++++++++++++
 scripts/setup.py           | 133 ++++++++++++++++++++++++++++++++++
 scripts/update_title.py    |  65 +++++++++++++++++
 8 files changed, 680 insertions(+)
 create mode 100644 scripts/check_kontor.py
 create mode 100644 scripts/copy_to_mariadb.py
 create mode 100644 scripts/copy_to_sqlite.py
 create mode 100644 scripts/db_structure.py
 create mode 100644 scripts/download.py
 create mode 100644 scripts/read_list.py
 create mode 100644 scripts/setup.py
 create mode 100644 scripts/update_title.py

diff --git a/scripts/check_kontor.py b/scripts/check_kontor.py
new file mode 100644
index 0000000..8e350cb
--- /dev/null
+++ b/scripts/check_kontor.py
@@ -0,0 +1,142 @@
+"""
+Checks the database kontor
+"""
+from enum import Enum, auto
+
+import mariadb
+from pathlib import Path
+from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
+from setup import get_database_cursors, get_logger
+
+parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
+parser.add_argument('--verbose', '-v', action='count', default=0)
+parser.add_argument('--dir', '-d', default='/media/tpeetz/Media')
+parser.add_argument('--dry-run', '-m', action='store_true')
+parser.add_argument('--reset-cloud-link', '-r', action='store_true')
+args = parser.parse_args()
+
+
+class StatusType(Enum):
+    """How a file found on disk relates to the rows of media_file."""
+    UNKNOWN = auto()
+    FILE_NAME = auto()
+    FILE_ID = auto()
+    DUPLICATE = auto()
+    CLOUD_LINK = auto()
+    CLOUD_LINK_ID = auto()
+
+
+def _fetch_rows(cursor, statement, params):
+    """Runs a parameterized SELECT and returns its rows, [] if it fails."""
+    try:
+        cursor.execute(statement, params)
+        return cursor.fetchall()
+    except mariadb.Error as error:
+        logger.debug(f'select failed with {error}')
+        return []
+
+
+def get_status_of_file(found_file, cursor):
+    """
+    Classifies a file by looking it up in media_file
+    :param found_file: Path of the file found in the media directory
+    :param cursor: MariaDB cursor used for the lookups
+    :return: tuple (StatusType, id of the matching row or '')
+    """
+    status = StatusType.UNKNOWN
+    file_id = ''
+    # The lookups run in this order on purpose: a later match overrides an
+    # earlier one. Values are bound as parameters because file names may
+    # contain quotes.
+    rows = _fetch_rows(cursor, 'SELECT id, cloud_link FROM media_file WHERE file_name=?', (found_file.name,))
+    if len(rows) == 1:
+        status = StatusType.FILE_NAME
+        file_id = rows[0][0]
+    rows = _fetch_rows(cursor, 'SELECT id FROM media_file WHERE id=?', (found_file.stem,))
+    if len(rows) == 1:
+        status = StatusType.FILE_ID
+        file_id = rows[0][0]
+    if len(rows) > 1:
+        status = StatusType.DUPLICATE
+        for row in rows:
+            logger.info(f"found {row[0]} with {found_file}")
+    rows = _fetch_rows(cursor, 'SELECT id FROM media_file WHERE cloud_link LIKE ?', (f'%{found_file.stem}%',))
+    if len(rows) == 1:
+        file_id = rows[0][0]
+        if rows[0][0] == found_file.stem:
+            status = StatusType.CLOUD_LINK_ID
+        else:
+            status = StatusType.CLOUD_LINK
+    return status, file_id
+
+
+def rename_files_to_id(media_dir, conn, dry_run):
+    """
+    Renames every known file in media_dir to id + suffix and stores the path
+    :param media_dir: directory that is scanned (not recursively)
+    :param conn: MariaDB connection
+    :param dry_run: only log what would be done
+    """
+    media_path = Path(media_dir)
+    cursor = conn.cursor()
+    for file in media_path.iterdir():
+        logger.debug('found file: {}'.format(file.name))
+        (status, file_id) = get_status_of_file(file, cursor)
+        if status in (StatusType.UNKNOWN, StatusType.DUPLICATE):
+            logger.info(f'status of {file.name} is {status.name.lower()}')
+            continue
+        # Built only for matched files: an unmatched file without suffix
+        # would yield an empty name and with_name() raises ValueError.
+        new_file_path = file.with_name(f"{file_id}{file.suffix}")
+        if status is StatusType.CLOUD_LINK_ID:
+            logger.debug(f'status of {file.name} is {status.name.lower()}')
+        else:
+            logger.info(f'status of {file.name} is {status.name.lower()}')
+        if status in (StatusType.FILE_NAME, StatusType.CLOUD_LINK):
+            rename_file(file, new_file_path, dry_run)
+        update_cloud_link(file_id, new_file_path, conn, dry_run)
+
+
+def rename_file(current_file, new_file_path, dry_run):
+    """Renames current_file to new_file_path, or only logs it on dry_run."""
+    if dry_run:
+        logger.info('rename file {} to {}'.format(current_file.name, new_file_path.name))
+    else:
+        current_file.rename(new_file_path)
+
+
+def update_cloud_link(file_id, file_path, conn, dry_run):
+    """Stores the absolute path of file_path as cloud_link of entry file_id."""
+    cloud_link = str(file_path.absolute())
+    logger.debug(f'update entry {file_id} with {cloud_link}')
+    if dry_run:
+        logger.info(f'UPDATE media_file: cloud_link={cloud_link}')
+    else:
+        cursor = conn.cursor()
+        cursor.execute('UPDATE media_file SET cloud_link=? WHERE id=?', (cloud_link, file_id))
+        conn.commit()
+
+
+def reset_cloud_link(conn, dry_run):
+    """Clears cloud_link of all entries, or only logs it on dry_run."""
+    if dry_run:
+        logger.info('UPDATE media_file SET cloud_link=""')
+    else:
+        cursor = conn.cursor()
+        cursor.execute('UPDATE media_file SET cloud_link="" WHERE id is NOT NULL')
+        conn.commit()
+
+
+if __name__ == '__main__':
+    logger = get_logger(args.verbose)
+    logger.info("kontor.check_kontor started")
+    sqlite_conn, mariadb_conn = get_database_cursors(logger)
+    try:
+        if args.reset_cloud_link:
+            reset_cloud_link(mariadb_conn, args.dry_run)
+        logger.info("kontor.check_kontor.rename_files_to_id")
+        rename_files_to_id(args.dir, mariadb_conn, args.dry_run)
+    finally:
+        sqlite_conn.close()
+        mariadb_conn.close()
+    logger.info("kontor.check_kontor finished")
diff --git a/scripts/copy_to_mariadb.py b/scripts/copy_to_mariadb.py
new file mode 100644
index 0000000..bb029be
--- /dev/null
+++ b/scripts/copy_to_mariadb.py
@@ -0,0 +1,60 @@
+"""
+copy data from SQLite to MariaDB
+"""
+import sqlite3
+import mariadb
+from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
+from setup import get_database_cursors, create_tables, get_logger, get_scripts, get_meta_data
+
+parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
+parser.add_argument('--recreate-db', action='store_true')
+parser.add_argument('--verbose', '-v', action='count', default=0)
+args = parser.parse_args()
+
+
+def copy_data(mariadb_conn, sqlite_conn, table_scripts):
+    """
+    Replaces the content of every MariaDB table with the rows from SQLite
+    :param mariadb_conn: connection to the target database
+    :param sqlite_conn: connection to the source database
+    :param table_scripts: statements per table id as built by get_scripts
+    """
+    mariadb_cursor = mariadb_conn.cursor()
+    sqlite_cursor = sqlite_conn.cursor()
+    # Tables are truncated and refilled in arbitrary order, so foreign key
+    # checks are switched off for the copy and restored afterwards.
+    mariadb_cursor.execute("SET FOREIGN_KEY_CHECKS = 0")
+    try:
+        for table in table_scripts.values():
+            insert_statement = table['insert_mariadb']
+            try:
+                # Read everything from SQLite first: the table is only emptied once its rows are available.
+                rows = sqlite_cursor.execute(table['select_sqlite']).fetchall()
+                source_count = sqlite_cursor.execute(table['count']).fetchone()[0]
+            except sqlite3.Error as error:
+                logger.info('select failed with %s', error)
+                continue
+            mariadb_cursor.execute(table['truncate_mariadb'])
+            for row in rows:
+                try:
+                    mariadb_cursor.execute(insert_statement, row)
+                except mariadb.Error as error:
+                    logger.info('insert failed with %s\n%s\n%s', error, insert_statement, row)
+            mariadb_conn.commit()
+            mariadb_cursor.execute(table['count'])
+            (number_of_rows,) = mariadb_cursor.fetchone()
+            logger.info('%s contains %d : %d entries', table['name'], number_of_rows, source_count)
+    finally:
+        mariadb_cursor.execute("SET FOREIGN_KEY_CHECKS = 1")
+
+
+if __name__ == '__main__':
+    logger = get_logger(args.verbose)
+    logger.info('kontor.copy_to_mariadb started')
+    s_conn, m_conn = get_database_cursors(logger)
+    meta_data_tables = get_meta_data(m_conn)
+    scripts = get_scripts(meta_data_tables, logger)
+    copy_data(m_conn, s_conn, scripts)
+    s_conn.close()
+    m_conn.close()
+    logger.info('kontor.copy_to_mariadb finished')
diff --git a/scripts/copy_to_sqlite.py b/scripts/copy_to_sqlite.py
new file mode 100644
index 0000000..3544a96
--- /dev/null
+++ b/scripts/copy_to_sqlite.py
@@ -0,0 +1,55 @@
+"""
+copy data from MariaDB to SQLite
+"""
+import sqlite3
+import mariadb
+from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
+from setup import get_database_cursors, create_tables, get_logger, get_meta_data, get_scripts
+
+parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
+parser.add_argument('--recreate-db', action='store_true')
+parser.add_argument('--verbose', '-v', action='count', default=0)
+args = parser.parse_args()
+
+
+def copy_data(mariadb_conn, sqlite_conn, table_scripts):
+    """
+    Copies the rows of every table from MariaDB into SQLite
+    :param mariadb_conn: connection to the source database
+    :param sqlite_conn: connection to the target database
+    :param table_scripts: statements per table id as built by get_scripts
+    """
+    mariadb_cursor = mariadb_conn.cursor()
+    sqlite_cursor = sqlite_conn.cursor()
+    # Only the table_scripts parameter is read here, so the function does
+    # not depend on a module level variable set by the main block.
+    for table in table_scripts.values():
+        insert_statement = table['insert_sqlite']
+        try:
+            mariadb_cursor.execute(table['select_mariadb'])
+            rows = mariadb_cursor.fetchall()
+            for row in rows:
+                try:
+                    sqlite_cursor.execute(insert_statement, row)
+                except sqlite3.Error as error:
+                    logger.info('insert failed with %s\n%s\n%s', error, insert_statement, row)
+            sqlite_conn.commit()
+            mariadb_cursor.execute(table['count'])
+            (number_of_rows,) = mariadb_cursor.fetchone()
+            row = sqlite_cursor.execute(table['count']).fetchone()
+            logger.info('%s contains %d : %d entries', table['name'], number_of_rows, row[0])
+        except mariadb.Error as error:
+            logger.info('select failed with %s', error)
+
+
+if __name__ == '__main__':
+    logger = get_logger(args.verbose)
+    logger.info('kontor.copy_to_sqlite started')
+    s_conn, m_conn = get_database_cursors(logger)
+    meta_data_tables = get_meta_data(m_conn)
+    scripts = get_scripts(meta_data_tables, logger)
+    create_tables(s_conn, logger, args.recreate_db, scripts)
+    copy_data(m_conn, s_conn, scripts)
+    s_conn.close()
+    m_conn.close()
+    logger.info('kontor.copy_to_sqlite finished')
diff --git a/scripts/db_structure.py b/scripts/db_structure.py
new file mode 100644
index 0000000..d363a52
--- /dev/null
+++ b/scripts/db_structure.py
@@ -0,0 +1,61 @@
+"""
+Prints the database kontor structure
+"""
+import mariadb
+from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
+from setup import get_database_cursors, get_logger
+
+parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
+parser.add_argument('--verbose', '-v', action='count', default=0)
+args = parser.parse_args()
+
+
+def show_tables(cur, log):
+    """
+    Retrieves the list of tables from the database
+    :param cur: MariaDB cursor
+    :param log: logger
+    :return: list of table names
+    """
+    log.info('get list of tables')
+    cur.execute("SHOW TABLES")
+    return [tablename for (tablename,) in cur.fetchall()]
+
+
+def get_field_info(cur):
+    """
+    Retrieves the field info associated with a cursor
+    :param cur: cursor that has executed a statement
+    :return: list of "name: type flags" strings, one per result column
+    """
+    field_info = mariadb.fieldinfo()
+    return [f"{column[0]}: {field_info.type(column)} {field_info.flag(column)}"
+            for column in cur.description]
+
+
+def get_table_field_info(cur, tablename):
+    """
+    Retrieves the field info associated with a table
+    :param cur: MariaDB cursor
+    :param tablename: name of the table
+    :return: list of column descriptions, see get_field_info
+    """
+    # Identifiers cannot be bound as parameters; backticks keep names that
+    # are reserved words or contain special characters working.
+    quoted_name = tablename.replace('`', '``')
+    cur.execute(f"SELECT * FROM `{quoted_name}` LIMIT 1")
+    return get_field_info(cur)
+
+
+if __name__ == '__main__':
+    logger = get_logger(args.verbose)
+    logger.info("kontor.db_structure started")
+    _, mariadb_conn = get_database_cursors(logger)
+    tables = show_tables(mariadb_conn.cursor(), logger)
+    for table in tables:
+        field_info_text = get_table_field_info(mariadb_conn.cursor(), table)
+        print(f"Columns in table {table}:")
+        print("\n".join(field_info_text))
+        print("\n")
+    mariadb_conn.close()
+    logger.info("kontor.db_structure finished")
diff --git a/scripts/download.py b/scripts/download.py
new file mode 100644
index 0000000..787ff9d
--- /dev/null
+++ b/scripts/download.py
@@ -0,0 +1,95 @@
+"""
+download files with URLs from DB
+"""
+import re
+import subprocess
+from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
+import mariadb
+from setup import get_database_cursors, create_tables, get_logger
+
+parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
+parser.add_argument('--verbose', '-v', action='count', default=0)
+parser.add_argument('--dir', '-d', default='/data/media')
+parser.add_argument('--dry-run', '-m', action='store_true')
+parser.add_argument('--rename', '-r', action='store_true')
+args = parser.parse_args()
+
+YT_DLP = "/home/tpeetz/bin/yt-dlp"
+DOWNLOAD_PREFIX = '[download] '
+ALREADY_DOWNLOADED = ' has already been downloaded'
+DESTINATION_MARKER = 'Destination: '
+
+
+def parse_output(lines_list, log):
+    """
+    Extracts the name of the downloaded file from the yt-dlp output
+    :param lines_list: lines printed by yt-dlp
+    :param log: logger
+    :return: file name of the last matching line or "" if there is none
+    """
+    file_name = ""
+    for line in lines_list:
+        if ALREADY_DOWNLOADED in line:
+            # The name is the text between the prefix and the trailing phrase.
+            file_name = line.removeprefix(DOWNLOAD_PREFIX).rpartition(ALREADY_DOWNLOADED)[0]
+            log.info('found file: "%s"', file_name)
+        destination = line.partition(DESTINATION_MARKER)[2]
+        if destination:
+            file_name = destination
+            log.info('new file: "%s"', file_name)
+    return file_name
+
+
+def download_url(video_url, log):
+    """
+    Downloads video_url with yt-dlp into the directory given by --dir
+    :param video_url: URL to download
+    :param log: logger
+    :return: name of the downloaded file or None if the download failed
+    """
+    result = subprocess.run([YT_DLP, video_url], cwd=args.dir, capture_output=True, text=True)
+    if result.returncode != 0:
+        log.debug('yt-dlp failed with code %d: %s', result.returncode, result.stderr)
+        return None
+    output = re.sub(' +', ' ', result.stdout)
+    # An empty name means the output had no file line; report it as a failure
+    # so that no entry is stored with an empty file_name.
+    return parse_output(output.splitlines(), log) or None
+
+
+def download_and_update(link, entry_id, conn):
+    """
+    Downloads link and stores the outcome in entry entry_id of media_file
+    :param link: URL to download
+    :param entry_id: id of the media_file entry
+    :param conn: MariaDB connection
+    """
+    m_cursor = conn.cursor()
+    filename = download_url(link, logger)
+    if filename is None:
+        update_statement = 'UPDATE media_file set review = true WHERE id = ?'
+        logger.debug(f'entry {entry_id} could not downloaded, set to Review')
+        m_cursor.execute(update_statement, (entry_id,))
+    else:
+        update_statement = 'UPDATE media_file set file_name = ?, should_download = false, review = false WHERE id = ?'
+        logger.debug(f'entry {entry_id} successfully downloaded, set review and should_download to false')
+        m_cursor.execute(update_statement, (filename, entry_id))
+    conn.commit()
+
+
+if __name__ == '__main__':
+    logger = get_logger(args.verbose)
+    logger.info('kontor.download started')
+    s_conn, m_conn = get_database_cursors(logger)
+    cursor = m_conn.cursor()
+    cursor.execute('SELECT id, url FROM media_file where should_download is true')
+    for (link_id, url) in cursor.fetchall():
+        if url is None:
+            logger.info('There is no url for id {}'.format(link_id))
+        elif args.dry_run:
+            logger.info(f'download {url} for {link_id}')
+        else:
+            download_and_update(url, link_id, m_conn)
+    s_conn.close()
+    m_conn.close()
+    logger.info('kontor.download finished')
diff --git a/scripts/read_list.py b/scripts/read_list.py
new file mode 100644
index 0000000..cc1c4e8
--- /dev/null
+++ b/scripts/read_list.py
@@ -0,0 +1,69 @@
+"""
+read file with URLs and store in DB
+"""
+import uuid
+from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
+import datetime
+
+import mariadb
+from setup import get_database_cursors, get_logger, get_scripts, get_meta_data
+
+parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
+parser.add_argument('-f', '--links', help='file with links')
+parser.add_argument('--verbose', '-v', action='count', default=0)
+args = parser.parse_args()
+
+
+def read_links_file(links_file):
+    """
+    Reads the links from a text file with one link per line
+    :param links_file: path of the file
+    :return: list of links without surrounding whitespace, blank lines skipped
+    """
+    with open(links_file, 'r') as input_file:
+        return [line.strip() for line in input_file if line.strip()]
+
+
+def add_link_to_db(statement, connection, video_url, log):
+    """
+    Inserts a new media_file entry for video_url
+    :param statement: INSERT statement with 11 placeholders
+    :param connection: MariaDB connection
+    :param video_url: URL stored in the entry
+    :param log: logger
+    :return: id of the new entry or None if the insert failed
+    """
+    entry_id = str(uuid.uuid4())
+    current_date_time = datetime.datetime.now()
+    # NOTE(review): the value order has to match the column order of the
+    # statement, which comes from meta_data_column - confirm when it changes.
+    values = (entry_id, current_date_time, current_date_time, 0, video_url, True, True, None, None, None, None)
+    try:
+        cur = connection.cursor()
+        cur.execute(statement, values)
+        connection.commit()
+        log.info(f'link {video_url} added to db')
+    except mariadb.Error as insert_error:
+        log.error("insert failed with %s", insert_error)
+        entry_id = None
+    return entry_id
+
+
+if __name__ == '__main__':
+    logger = get_logger(args.verbose)
+    logger.info('kontor.read_list started')
+    s_conn, m_conn = get_database_cursors(logger)
+    meta_data_tables = get_meta_data(m_conn)
+    scripts = get_scripts(meta_data_tables, logger)
+    tables = {scripts[table_id]['name']: table_id for table_id in scripts}
+    insert_statement = scripts[tables['media_file']]['insert_mariadb']
+    if args.links:
+        logger.info("read links from file")
+        for link in read_links_file(args.links):
+            logger.info("add link to db")
+            add_link_to_db(insert_statement, m_conn, link.strip(), logger)
+    else:
+        logger.info('script used: {}'.format(insert_statement))
+    s_conn.close()
+    m_conn.close()
+    logger.info('kontor.read_list finished')
diff --git a/scripts/setup.py b/scripts/setup.py
new file mode 100644
index 0000000..03342a6
--- /dev/null
+++ b/scripts/setup.py
@@ -0,0 +1,133 @@
+"""
+Setup database connections
+"""
+import sqlite3
+import mariadb
+import logging.config
+from platformdirs import PlatformDirs
+from pathlib import Path
+import yaml
+
+
+def get_database_cursors(log):
+    """
+    Opens the databases configured in database-config.yaml
+    :param log: logger
+    :return: tuple (SQLite connection, MariaDB connection)
+    """
+    dirs = PlatformDirs("kontor")
+    database_config = Path(dirs.user_config_dir, 'database-config.yaml')
+    with open(database_config, 'rt') as f:
+        db_config = yaml.safe_load(f)
+    sqlite_db = db_config["sqlite"]["file"]
+    log.info('using SQLite3 database {}'.format(sqlite_db))
+    sqlite_conn = sqlite3.connect(sqlite_db, detect_types=sqlite3.PARSE_DECLTYPES | sqlite3.PARSE_COLNAMES)
+    mariadb_config = db_config['mariadb']
+    try:
+        mariadb_conn = mariadb.connect(
+            host=mariadb_config['host'],
+            port=mariadb_config['port'],
+            user=mariadb_config['user'],
+            password=mariadb_config['password'],
+            database=mariadb_config['database']
+        )
+    except mariadb.Error:
+        # Do not leave the SQLite connection open when MariaDB is unreachable.
+        sqlite_conn.close()
+        raise
+    return sqlite_conn, mariadb_conn
+
+
+def create_tables(sqlite_conn, logger, recreate_db, scripts):
+    """
+    Creates the tables in the SQLite database
+    :param sqlite_conn: SQLite connection
+    :param logger: logger
+    :param recreate_db: drop existing tables before creating them
+    :param scripts: statements per table id as built by get_scripts
+    """
+    logger.info('create_tables')
+    cursor = sqlite_conn.cursor()
+    for table in scripts.values():
+        if recreate_db:
+            logger.debug(table['drop'])
+            cursor.execute(table['drop'])
+        logger.debug(table['create'])
+        cursor.execute(table['create'])
+
+
+def get_logger(level):
+    """
+    Configures logging from logging-config.yaml
+    :param level: number of -v flags; 0 logs from INFO, 1 or more from DEBUG,
+        None keeps the level from the configuration
+    :return: logger 'development'
+    """
+    dirs = PlatformDirs("kontor")
+    logging_config = Path(dirs.user_config_dir, 'logging-config.yaml')
+    with open(logging_config, 'rt') as f:
+        config = yaml.safe_load(f)
+    logging.config.dictConfig(config)
+    logger = logging.getLogger('development')
+    if level is not None:
+        # More -v flags must never reduce the output.
+        logger.setLevel(logging.DEBUG if level >= 1 else logging.INFO)
+    return logger
+
+
+def get_meta_data(mariadb_conn):
+    """
+    Reads the table and column descriptions from the meta data tables
+    :param mariadb_conn: MariaDB connection
+    :return: dict table id -> {"name": table name, "columns": list of column dicts}
+    """
+    column_keys = ("column_name", "column_sync_name", "column_type", "column_modifier", "column_order")
+    # Ordered by column_order so that the generated statements list the
+    # columns in a defined order.
+    column_statement = "SELECT {} FROM meta_data_column WHERE table_id=? ORDER BY column_order".format(", ".join(column_keys))
+    mariadb_cursor = mariadb_conn.cursor()
+    mariadb_cursor.execute("SELECT id, table_name FROM meta_data_table")
+    meta_data = {}
+    for (identifier, table_name) in mariadb_cursor.fetchall():
+        mariadb_cursor.execute(column_statement, (identifier, ))
+        columns = [dict(zip(column_keys, row)) for row in mariadb_cursor.fetchall()]
+        meta_data[identifier] = {"name": table_name, "columns": columns}
+    return meta_data
+
+
+def get_scripts(meta_data, logger):
+    """
+    Builds the SQL statements for every table described in meta_data
+    :param meta_data: result of get_meta_data
+    :param logger: logger (currently unused)
+    :return: dict table id -> dict with the keys create, drop, select_mariadb,
+        select_sqlite, insert_sqlite, insert_mariadb, truncate_mariadb, count, name
+    """
+    scripts_map = {}
+    for table_id, table_data in meta_data.items():
+        table_name = table_data["name"]
+        m_columns = []
+        s_columns = []
+        definitions = []
+        for column_data in table_data["columns"]:
+            definition = "{} {}".format(column_data["column_sync_name"], column_data["column_type"])
+            if column_data["column_modifier"]:
+                definition += " " + column_data["column_modifier"]
+            definitions.append(definition)
+            m_columns.append(column_data['column_name'])
+            s_columns.append(column_data['column_sync_name'])
+        m_column_list = ', '.join(m_columns)
+        s_column_list = ', '.join(s_columns)
+        placeholders = ', '.join(['?'] * len(m_columns))
+        scripts_map[table_id] = {
+            "create": "CREATE TABLE IF NOT EXISTS {} ({});".format(table_name, ", ".join(definitions)),
+            "drop": 'DROP TABLE IF EXISTS {}'.format(table_name),
+            "select_mariadb": 'SELECT {} FROM {}'.format(m_column_list, table_name),
+            "select_sqlite": 'SELECT {} FROM {}'.format(s_column_list, table_name),
+            "insert_sqlite": 'INSERT INTO {}({}) VALUES({})'.format(table_name, s_column_list, placeholders),
+            "insert_mariadb": 'INSERT INTO {}({}) VALUES({})'.format(table_name, m_column_list, placeholders),
+            "truncate_mariadb": 'TRUNCATE {}'.format(table_name),
+            "count": "SELECT COUNT(*) FROM {}".format(table_name),
+            "name": table_name,
+        }
+    return scripts_map
diff --git a/scripts/update_title.py b/scripts/update_title.py
new file mode 100644
index 0000000..9acf8d2
--- /dev/null
+++ b/scripts/update_title.py
@@ -0,0 +1,65 @@
+"""
+update the title of entries marked for review from their web page
+"""
+import re
+import subprocess
+import datetime
+import logging
+import mariadb
+import requests
+from bs4 import BeautifulSoup
+from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
+from setup import get_database_cursors, create_tables, get_logger
+
+parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
+parser.add_argument('--verbose', '-v', action='count', default=0)
+args = parser.parse_args()
+
+
+def fetch_title(url):
+    """
+    Retrieves the title of the web page at url
+    :param url: address of the page
+    :return: text of the title element or None if the page has none
+    :raises requests.RequestException: if the page could not be retrieved
+    """
+    # Without a timeout one unresponsive server would block the script forever.
+    response = requests.get(url, timeout=30)
+    response.raise_for_status()
+    soup = BeautifulSoup(response.content, "html.parser")
+    if soup.title is None:
+        return None
+    return soup.title.string
+
+
+if __name__ == '__main__':
+    logger = get_logger(args.verbose)
+    logger.info('kontor.update_title started')
+    s_conn, m_conn = get_database_cursors(logger)
+    cursor = m_conn.cursor()
+    cursor.execute('SELECT id, url FROM media_file where review is true')
+    for (link_id, url) in cursor.fetchall():
+        if url is None:
+            logger.info('There is no url for id {}'.format(link_id))
+            continue
+        logger.info('get title for url {}'.format(url))
+        try:
+            title = fetch_title(url)
+        except requests.RequestException as error:
+            logger.info("Sorry, could not retrieve title: %s", error)
+            title = None
+        if title is None:
+            # No title available: the entry stays in review and nothing is
+            # written, so no stale or undefined title can reach the database.
+            continue
+        logger.info('ID {} has title {}'.format(link_id, title))
+        update = 'UPDATE media_file SET title = ?, review= False where id= ?'
+        try:
+            cursor.execute(update, (title, link_id))
+            m_conn.commit()
+            logger.info('entry {} updated'.format(link_id))
+        except mariadb.Error as error:
+            logger.info(error)
+    s_conn.close()
+    m_conn.close()
+    logger.info('kontor.update_title finished')