From 9cb71f18c2909924aa363f90d89d21ed6f8bf13e Mon Sep 17 00:00:00 2001 From: Thomas Peetz Date: Sun, 31 May 2026 00:18:40 +0200 Subject: [PATCH] check for duplicate links --- kontor-scripts/check_kontor.py | 206 ++++++++++----------------------- kontor-scripts/sync.py | 2 +- 2 files changed, 63 insertions(+), 145 deletions(-) diff --git a/kontor-scripts/check_kontor.py b/kontor-scripts/check_kontor.py index e75a543..4283227 100644 --- a/kontor-scripts/check_kontor.py +++ b/kontor-scripts/check_kontor.py @@ -1,22 +1,22 @@ """ Checks the database kontor """ +from dataclasses import dataclass from enum import Enum, auto -import json -import mariadb -import requests -from pathlib import Path +from logging import Logger +from typing import Dict, List, Optional from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter +from urllib.parse import urlparse + +from api import Option, OptionType, Server, get_api_config, get_logger -from config import get_logger, get_database_cursors parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter) -parser.add_argument('--verbose', '-v', action='count', default=0) -parser.add_argument('--config', '-c', default='kontor') -parser.add_argument('--file', '-f') -parser.add_argument('--dir', '-d') -parser.add_argument('--dry-run', '-m', action='store_true') -parser.add_argument('--reset-cloud-link', '-r', action='store_true') +parser.add_argument("--verbose", "-v", action="count", default=0) +parser.add_argument("--config", "-c", default="kontor-api") +parser.add_argument("--dir", "-d", default="/data/media") +parser.add_argument("--dry-run", "-m", action="store_true") +parser.add_argument("--server", "-s") args = parser.parse_args() class StatusType(Enum): @@ -36,142 +36,61 @@ class FileStatus: self.id = response['id'] -def get_status_of_file(found_file: Path, cursor, log) -> FileStatus: - status = FileStatus() - try: - cursor.execute(f'SELECT id, cloud_link FROM media_file WHERE file_name="{found_file.name}"') - rows = cursor.fetchall() - if len(rows) == 1: - status.status_type = StatusType.FILE_NAME - status.id = rows[0][0] - except mariadb.Error as error: - log.debug(f'select failed with {error}') - try: - cursor.execute(f'SELECT id FROM media_file WHERE id="{found_file.stem}"') - rows = cursor.fetchall() - if len(rows) == 1: - status.status_type = StatusType.FILE_ID - status.id = rows[0][0] - if len(rows) > 1: - status.status_type = StatusType.DUPLICATE - for row in rows: - log.info(f"found {row[0]} with {found_file}") - except mariadb.Error as error: - log.debug(f'select failed with {error}') - try: - cursor.execute(f'SELECT id FROM media_file WHERE cloud_link LIKE "%{found_file.stem}%"') - rows = cursor.fetchall() - if len(rows) == 1: - status.id = rows[0][0] - if rows[0][0] == found_file.stem: - status.status_type = StatusType.CLOUD_LINK_ID - else: - status.status_type = StatusType.CLOUD_LINK - except mariadb.Error as error: - log.debug(f'select failed with {error}') - response = requests.get(f"http://127.0.0.1:8800/media/files/{found_file.stem}") - log.debug(f"Status: {response.status_code}") - if response.status_code == 200: - status.status_type = StatusType.FILE_ID - status.id = response.json()['id'] - return status +def create_item_id_mapping(log: Logger, data_list: List[dict]) -> Dict[str, dict]: + """ + create dictionary with id as key and dictionary as value. + """ + item_id_mapping: Dict[str, dict] = {} + for data_item in data_list: + log.debug(data_item) + item_id_mapping[data_item["id"]] = data_item + return item_id_mapping -def rename_files_to_id(media_dir, dry_run, conn, log): - media_path = Path(media_dir) - cursor = conn.cursor() - for file in media_path.iterdir(): - log.debug('found file: {}'.format(file.name)) - status: FileStatus = get_status_of_file(file, cursor, log) - file_id = status.id - if not file_id: - log.info(f"ID of file {file.name} is unknown") + +def check_duplicate_links(log: Logger, server: Server): + data = server.request(log=logger, table="media_file") + mapping = create_item_id_mapping(log=log, data_list=data) + visited_link_path: Dict[str, str] = {} + duplicate_link_paths: Dict[str, List[str]] = {} + for item in data: + link = item["url"] + if len(link) == 0: continue - new_file_path = file.with_name(f"{file_id}{file.suffix}") - match status.status_type: - case StatusType.FILE_NAME: - log.info(f'status of {file.name} is file_name') - rename_file(file, new_file_path, dry_run, log) - update_cloud_link(file_id, new_file_path, conn, dry_run, log) - case StatusType.FILE_ID: - log.info(f'status of {file.name} is file_id') - update_cloud_link(file_id, new_file_path, conn, dry_run, log) - case StatusType.CLOUD_LINK: - log.info(f'status of {file.name} is cloud_link') - rename_file(file, new_file_path, dry_run, log) - update_cloud_link(file_id, new_file_path, conn, dry_run, log) - case StatusType.CLOUD_LINK_ID: - log.debug(f'status of {file.name} is cloud_link_id') - update_cloud_link(file_id, new_file_path, conn, dry_run, log) - case StatusType.DUPLICATE: - log.info(f'status of {file.name} is duplicate') - case StatusType.UNKNOWN: - log.info(f'status of {file.name} is unknown') - -def rename_file(current_file, new_file_path, dry_run, log): - if dry_run: - log.info('rename file {} to {}'.format(current_file.name, new_file_path.name)) - else: - current_file.rename(Path(new_file_path)) - -def update_cloud_link(file_id, file_path, conn, dry_run, log): - cursor = conn.cursor() - log.debug(f'update entry {file_id} with {file_path.absolute()}') - if dry_run: - log.debug(f'UPDATE media_file: cloud_link={file_path.absolute()}') - else: - cursor.execute('UPDATE media_file SET cloud_link="{}" WHERE id="{}"'.format(file_path.absolute(), file_id)) - conn.commit() - -def reset_cloud_link(conn, dry_run, log): - cursor = conn.cursor() - if dry_run: - log.info('UPDATE media_file SET cloud_link=""') - else: - cursor.execute('UPDATE media_file SET cloud_link="" WHERE id is NOT NULL') - conn.commit() - -def check_file_with_db(data_file: Path, m_conn, log): - log.info(f"read json file: {data_file}") - cursor = m_conn.cursor() - with open(data_file, 'r') as json_file: - json_load = json.load(json_file) - for table in json_load: - log.info(f"{table}: {len(json_load[table])}") - items = json_load[table] - for item in items: - item_id = item['id'] - select_statement = f"SELECT * FROM {table} WHERE id='{item_id}'" - cursor.execute(select_statement) - rows = cursor.fetchall() - count = len(rows) - log.info(f"{count} entries found for {item_id}") - if count == 0: - log.info(f"entry for {item_id} not found") - if count == 1: - log.info(f"check entry {item_id}") - #log.info(f"entry {rows[0]}") - columns = [] - values = [] - for (key, value) in item.items(): - columns.append(key) - values.append(value) - for index, _ in enumerate(columns): - log.info(f"compare {values[index]} with {rows[0][index]}") - + file_id = item["id"] + parsed_url = urlparse(link) + link_path = parsed_url.path + if link_path in visited_link_path: + log.info("duplicate url path found: %s", link_path) + if link_path in duplicate_link_paths: + duplicate_link_paths[link_path].append(file_id) + else: + duplicate_link_paths[link_path] = [] + duplicate_link_paths[link_path].append(visited_link_path[link_path]) + duplicate_link_paths[link_path].append(file_id) + else: + visited_link_path[link_path] = file_id + log.info("found %s duplicate links", len(duplicate_link_paths.keys())) + deletion_list: List[str] = [] + for key, value in duplicate_link_paths.items(): + if len(value) == 2: + log.info("%s:\n%s - %s\n%s - %s", key, value[0], mapping[value[0]]["url"], value[1], mapping[value[1]]["url"]) + if mapping[value[0]]["url"].startswith("https://xhamster"): + deletion_list.append(value[0]) + else: + deletion_list.append(value[1]) + else: + log.info("found %s links", len(value)) + for key in deletion_list: + log.info("%s - %s", key, mapping[key]["url"]) if __name__ == '__main__': - log = get_logger(args.verbose, args.config) - log.info("kontor.check_kontor started") - _, m_conn = get_database_cursors(log, args.config) - if args.dir: - log.info("kontor.check_kontor.rename_files_to_id") - rename_files_to_id(args.dir, args.dry_run, m_conn, log) - if args.file: - data_file = Path(args.file) - if data_file.exists(): - log.info("kontor.check_kontor.check_file_with_db") - check_file_with_db(data_file, m_conn, log) + logger = get_logger(args.verbose, args.config) + logger.info("kontor.check_kontor started") + APICONFIG = get_api_config(logger, args.config) + server: Server = APICONFIG.server[0] + logger.info("kontor.check_kontor.check_duplicate_links") + check_duplicate_links(logger, server) #logger.info("kontor.check_kontor.update_cloud_link_with_found_files") #update_cloud_link_with_found_files(data_dir, mariadb_conn, args.dry_run) #logger.info("kontor.check_kontor.get_ids_from_column_cloud_link") @@ -179,5 +98,4 @@ if __name__ == '__main__': #logger.info('found {} ids in column cloud_link'.format(len(link_list))) #logger.info("kontor.check_kontor.checking_ids_from_cloud_link") #checking_ids_from_cloud_link(link_list, mariadb_cursor) - log.info("kontor.check_kontor finished") - + logger.info("kontor.check_kontor finished") diff --git a/kontor-scripts/sync.py b/kontor-scripts/sync.py index 8e71906..c8b94bc 100644 --- a/kontor-scripts/sync.py +++ b/kontor-scripts/sync.py @@ -31,7 +31,7 @@ def create_item_id_mapping(log: Logger, data_list: List[dict]) -> Dict[str, dict """ item_id_mapping: Dict[str, dict] = {} for data_item in data_list: - log.debug(data_item) + log.debug(data_item) item_id_mapping[data_item["id"]] = data_item return item_id_mapping