From 9cb71f18c2909924aa363f90d89d21ed6f8bf13e Mon Sep 17 00:00:00 2001
From: Thomas Peetz <thomas.peetz@thpeetz.de>
Date: Sun, 31 May 2026 00:18:40 +0200
Subject: [PATCH] check for duplicate links

---
 kontor-scripts/check_kontor.py | 206 ++++++++++-----------------------
 kontor-scripts/sync.py         |   2 +-
 2 files changed, 63 insertions(+), 145 deletions(-)

diff --git a/kontor-scripts/check_kontor.py b/kontor-scripts/check_kontor.py
index e75a543..4283227 100644
--- a/kontor-scripts/check_kontor.py
+++ b/kontor-scripts/check_kontor.py
@@ -1,22 +1,22 @@
 """
 Checks the database kontor
 """
+from dataclasses import dataclass
 from enum import Enum, auto
-import json
-import mariadb
-import requests
-from pathlib import Path
+from logging import Logger
+from typing import Dict, List, Optional
 from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
+from urllib.parse import urlparse
+
+from api import Option, OptionType, Server, get_api_config, get_logger
 
-from config import get_logger, get_database_cursors
 
 parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
-parser.add_argument('--verbose', '-v', action='count', default=0)
-parser.add_argument('--config', '-c', default='kontor')
-parser.add_argument('--file', '-f')
-parser.add_argument('--dir', '-d')
-parser.add_argument('--dry-run', '-m', action='store_true')
-parser.add_argument('--reset-cloud-link', '-r', action='store_true')
+parser.add_argument("--verbose", "-v", action="count", default=0)
+parser.add_argument("--config", "-c", default="kontor-api")
+parser.add_argument("--dir", "-d", default="/data/media")  # NOTE(review): no reader left in the visible hunks - confirm still needed
+parser.add_argument("--dry-run", "-m", action="store_true")  # NOTE(review): no reader left in the visible hunks - confirm still needed
+parser.add_argument("--server", "-s")  # NOTE(review): never read in the visible hunks; the main block always uses APICONFIG.server[0] - confirm
 args = parser.parse_args()
 
 class StatusType(Enum):
@@ -36,142 +36,61 @@ class FileStatus:
         self.id = response['id']
 
 
-def get_status_of_file(found_file: Path, cursor, log) -> FileStatus:
-    status = FileStatus()
-    try:
-        cursor.execute(f'SELECT id, cloud_link FROM media_file WHERE file_name="{found_file.name}"')
-        rows = cursor.fetchall()
-        if len(rows) == 1:
-            status.status_type = StatusType.FILE_NAME
-            status.id = rows[0][0]
-    except mariadb.Error as error:
-        log.debug(f'select failed with {error}')
-    try:
-        cursor.execute(f'SELECT id FROM media_file WHERE id="{found_file.stem}"')
-        rows = cursor.fetchall()
-        if len(rows) == 1:
-            status.status_type = StatusType.FILE_ID
-            status.id = rows[0][0]
-        if len(rows) > 1:
-            status.status_type = StatusType.DUPLICATE
-            for row in rows:
-                log.info(f"found {row[0]} with {found_file}")
-    except mariadb.Error as error:
-        log.debug(f'select failed with {error}')
-    try:
-        cursor.execute(f'SELECT id FROM media_file WHERE cloud_link LIKE "%{found_file.stem}%"')
-        rows = cursor.fetchall()
-        if len(rows) == 1:
-            status.id = rows[0][0]
-            if rows[0][0] == found_file.stem:
-                status.status_type = StatusType.CLOUD_LINK_ID
-            else:
-                status.status_type = StatusType.CLOUD_LINK
-    except mariadb.Error as error:
-        log.debug(f'select failed with {error}')
-    response = requests.get(f"http://127.0.0.1:8800/media/files/{found_file.stem}")
-    log.debug(f"Status: {response.status_code}")
-    if response.status_code == 200:
-        status.status_type = StatusType.FILE_ID
-        status.id = response.json()['id']
-    return status
+def create_item_id_mapping(log: Logger, data_list: List[dict]) -> Dict[str, dict]:
+    """
+    Index the given items by their ``id`` value; an item replaces any earlier item with the same id.
+    """
+    items_by_id: Dict[str, dict] = {}
+    for entry in data_list:
+        log.debug(entry)
+        items_by_id[entry["id"]] = entry
+    return items_by_id
 
-def rename_files_to_id(media_dir, dry_run, conn, log):
-    media_path = Path(media_dir)
-    cursor = conn.cursor()
-    for file in media_path.iterdir():
-        log.debug('found file: {}'.format(file.name))
-        status: FileStatus = get_status_of_file(file, cursor, log)
-        file_id = status.id
-        if not file_id:
-            log.info(f"ID of file {file.name} is unknown")
+
+def check_duplicate_links(log: Logger, server: Server) -> None:
+    """
+    Report media_file entries whose url has the same path component as an earlier entry.
+
+    For a path shared by exactly two ids one id is picked as deletion candidate and logged;
+    other group sizes are only counted. deletion_list is just logged, nothing is deleted here.
+    """
+    data = server.request(log=log, table="media_file")
+    mapping = create_item_id_mapping(log=log, data_list=data)
+    visited_link_path: Dict[str, str] = {}
+    duplicate_link_paths: Dict[str, List[str]] = {}
+    for item in data:
+        link = item["url"]
+        if not link:  # empty or None url: nothing to compare
             continue
-        new_file_path = file.with_name(f"{file_id}{file.suffix}")
-        match status.status_type:
-            case StatusType.FILE_NAME:
-                log.info(f'status of {file.name} is file_name')
-                rename_file(file, new_file_path, dry_run, log)
-                update_cloud_link(file_id, new_file_path, conn, dry_run, log)
-            case StatusType.FILE_ID:
-                log.info(f'status of {file.name} is file_id')
-                update_cloud_link(file_id, new_file_path, conn, dry_run, log)
-            case StatusType.CLOUD_LINK:
-                log.info(f'status of {file.name} is cloud_link')
-                rename_file(file, new_file_path, dry_run, log)
-                update_cloud_link(file_id, new_file_path, conn, dry_run, log)
-            case StatusType.CLOUD_LINK_ID:
-                log.debug(f'status of {file.name} is cloud_link_id')
-                update_cloud_link(file_id, new_file_path, conn, dry_run, log)
-            case StatusType.DUPLICATE:
-                log.info(f'status of {file.name} is duplicate')
-            case StatusType.UNKNOWN:
-                log.info(f'status of {file.name} is unknown')
-
-def rename_file(current_file, new_file_path, dry_run, log):
-    if dry_run:
-        log.info('rename file {} to {}'.format(current_file.name, new_file_path.name))
-    else:
-        current_file.rename(Path(new_file_path))
-
-def update_cloud_link(file_id, file_path, conn, dry_run, log):
-    cursor = conn.cursor()
-    log.debug(f'update entry {file_id} with {file_path.absolute()}')
-    if dry_run:
-        log.debug(f'UPDATE media_file: cloud_link={file_path.absolute()}')
-    else:
-        cursor.execute('UPDATE media_file SET cloud_link="{}" WHERE id="{}"'.format(file_path.absolute(), file_id))
-        conn.commit()
-
-def reset_cloud_link(conn, dry_run, log):
-    cursor = conn.cursor()
-    if dry_run:
-        log.info('UPDATE media_file SET cloud_link=""')
-    else:
-        cursor.execute('UPDATE media_file SET cloud_link="" WHERE id is NOT NULL')
-        conn.commit()
-
-def check_file_with_db(data_file: Path, m_conn, log):
-    log.info(f"read json file: {data_file}")
-    cursor = m_conn.cursor()
-    with open(data_file, 'r') as json_file:
-        json_load = json.load(json_file)
-        for table in json_load:
-            log.info(f"{table}: {len(json_load[table])}")
-            items = json_load[table]
-            for item in items:
-                item_id = item['id']
-                select_statement = f"SELECT * FROM {table} WHERE id='{item_id}'"
-                cursor.execute(select_statement)
-                rows = cursor.fetchall()
-                count = len(rows)
-                log.info(f"{count} entries found for {item_id}")
-                if count == 0:
-                    log.info(f"entry for {item_id} not found")
-                if count == 1:
-                    log.info(f"check entry {item_id}")
-                    #log.info(f"entry {rows[0]}")
-                    columns = []
-                    values = []
-                    for (key, value) in item.items():
-                        columns.append(key)
-                        values.append(value)
-                    for index, _ in enumerate(columns):
-                        log.info(f"compare {values[index]} with {rows[0][index]}")
-
+        file_id = item["id"]
+        link_path = urlparse(link).path
+        if link_path not in visited_link_path:
+            visited_link_path[link_path] = file_id
+            continue
+        log.info("duplicate url path found: %s", link_path)
+        # the first id seen for this path opens the group, every later id is appended
+        duplicate_link_paths.setdefault(link_path, [visited_link_path[link_path]]).append(file_id)
+    log.info("found %s duplicate links", len(duplicate_link_paths))
+    deletion_list: List[str] = []
+    for key, value in duplicate_link_paths.items():
+        if len(value) != 2:
+            log.info("found %s links", len(value))
+            continue
+        first, second = value
+        log.info("%s:\n%s - %s\n%s - %s", key, first, mapping[first]["url"], second, mapping[second]["url"])
+        # candidate is the first id when its url starts with the prefix below, otherwise the second id
+        deletion_list.append(first if mapping[first]["url"].startswith("https://xhamster") else second)
+    for key in deletion_list:
+        log.info("%s - %s", key, mapping[key]["url"])
 
 
 if __name__ == '__main__':
-    log = get_logger(args.verbose, args.config)
-    log.info("kontor.check_kontor started")
-    _, m_conn = get_database_cursors(log, args.config)
-    if args.dir:
-        log.info("kontor.check_kontor.rename_files_to_id")
-        rename_files_to_id(args.dir, args.dry_run, m_conn, log)
-    if args.file:
-        data_file = Path(args.file)
-        if data_file.exists():
-            log.info("kontor.check_kontor.check_file_with_db")
-            check_file_with_db(data_file, m_conn, log)
+    logger = get_logger(args.verbose, args.config)
+    logger.info("kontor.check_kontor started")
+    APICONFIG = get_api_config(logger, args.config)
+    server: Server = APICONFIG.server[0]
+    logger.info("kontor.check_kontor.check_duplicate_links")
+    check_duplicate_links(logger, server)
     #logger.info("kontor.check_kontor.update_cloud_link_with_found_files")
     #update_cloud_link_with_found_files(data_dir, mariadb_conn, args.dry_run)
     #logger.info("kontor.check_kontor.get_ids_from_column_cloud_link")
@@ -179,5 +98,4 @@ if __name__ == '__main__':
     #logger.info('found {} ids in column cloud_link'.format(len(link_list)))
     #logger.info("kontor.check_kontor.checking_ids_from_cloud_link")
     #checking_ids_from_cloud_link(link_list, mariadb_cursor)
-    log.info("kontor.check_kontor finished")
-
+    logger.info("kontor.check_kontor finished")
diff --git a/kontor-scripts/sync.py b/kontor-scripts/sync.py
index 8e71906..c8b94bc 100644
--- a/kontor-scripts/sync.py
+++ b/kontor-scripts/sync.py
@@ -31,7 +31,7 @@ def create_item_id_mapping(log: Logger, data_list: List[dict]) -> Dict[str, dict
     """
     item_id_mapping: Dict[str, dict] = {}
     for data_item in data_list:
-        log.debug(data_item)        
+        log.debug(data_item)
         item_id_mapping[data_item["id"]] = data_item
     return item_id_mapping