check for duplicate links
Gitea Actions Demo / Explore-Gitea-Actions (push) Successful in 4s

This commit is contained in:
2026-05-31 00:18:40 +02:00
parent c885f6cc02
commit 6c4ff8bcad
2 changed files with 63 additions and 145 deletions
+62 -144
View File
@@ -1,22 +1,22 @@
"""
Checks the kontor database.
"""
from dataclasses import dataclass
from enum import Enum, auto
import json
import mariadb
import requests
from pathlib import Path
from logging import Logger
from typing import Dict, List, Optional
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from urllib.parse import urlparse
from api import Option, OptionType, Server, get_api_config, get_logger
from config import get_logger, get_database_cursors
# Command-line interface of the script; parsed once at import time into `args`.
# NOTE(review): two generations of the option set are listed back to back here
# (single-quoted first, double-quoted second). With argparse's default conflict
# handling, registering the same option string twice on one parser raises an
# ArgumentError, so only one generation can be live in the real file -- confirm
# against the checked-out source.
parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
# -v may be repeated; the resulting count is handed to get_logger().
parser.add_argument('--verbose', '-v', action='count', default=0)
# Name of the configuration handed to get_logger() and the config loaders.
parser.add_argument('--config', '-c', default='kontor')
# Optional JSON file that is compared against the database.
parser.add_argument('--file', '-f')
# Optional media directory whose files are renamed to their ids.
parser.add_argument('--dir', '-d')
# When set, changes are only logged instead of being carried out.
parser.add_argument('--dry-run', '-m', action='store_true')
parser.add_argument('--reset-cloud-link', '-r', action='store_true')
# Second generation of the options: different defaults for --config and --dir,
# no --file / --reset-cloud-link, and a new --server option.
parser.add_argument("--verbose", "-v", action="count", default=0)
parser.add_argument("--config", "-c", default="kontor-api")
parser.add_argument("--dir", "-d", default="/data/media")
parser.add_argument("--dry-run", "-m", action="store_true")
parser.add_argument("--server", "-s")
args = parser.parse_args()
class StatusType(Enum):
@@ -36,142 +36,61 @@ class FileStatus:
self.id = response['id']
def get_status_of_file(found_file: Path, cursor, log) -> FileStatus:
    """Work out how a file on disk relates to the ``media_file`` records.

    Four lookups run in order. Each one that matches overwrites what an
    earlier one stored, so the last matching lookup decides the result:

    1. ``file_name`` equals the full file name and exactly one row matches:
       ``StatusType.FILE_NAME``.
    2. ``id`` equals the file stem: ``StatusType.FILE_ID`` for exactly one
       row, ``StatusType.DUPLICATE`` (id left untouched) for several rows.
    3. ``cloud_link`` contains the file stem and exactly one row matches:
       ``StatusType.CLOUD_LINK_ID`` when that row's id equals the stem,
       otherwise ``StatusType.CLOUD_LINK``.
    4. ``GET /media/files/<stem>`` on the local media service answers 200:
       ``StatusType.FILE_ID`` with the id taken from the JSON body.

    Args:
        found_file: Path of the file to classify; only ``name`` and ``stem``
            are used.
        cursor: Open database cursor used for the three SELECT statements.
        log: Logger for debug and info output.

    Returns:
        The ``FileStatus`` filled in by the lookups above.

    Raises:
        requests.RequestException: If the HTTP request fails or times out.
            Database errors (``mariadb.Error``) are logged and skipped.
    """
    status = FileStatus()
    file_name = found_file.name
    file_stem = found_file.stem

    # All values are passed as bound parameters: file names come from the
    # file system and may contain quotes or other SQL metacharacters.
    try:
        cursor.execute(
            "SELECT id, cloud_link FROM media_file WHERE file_name=?",
            (file_name,),
        )
        rows = cursor.fetchall()
        if len(rows) == 1:
            status.status_type = StatusType.FILE_NAME
            status.id = rows[0][0]
    except mariadb.Error as error:
        log.debug("select failed with %s", error)

    try:
        cursor.execute("SELECT id FROM media_file WHERE id=?", (file_stem,))
        rows = cursor.fetchall()
        if len(rows) == 1:
            status.status_type = StatusType.FILE_ID
            status.id = rows[0][0]
        if len(rows) > 1:
            status.status_type = StatusType.DUPLICATE
            for row in rows:
                log.info("found %s with %s", row[0], found_file)
    except mariadb.Error as error:
        log.debug("select failed with %s", error)

    try:
        # The wildcards belong to the bound value, not to the statement text.
        cursor.execute(
            "SELECT id FROM media_file WHERE cloud_link LIKE ?",
            (f"%{file_stem}%",),
        )
        rows = cursor.fetchall()
        if len(rows) == 1:
            status.id = rows[0][0]
            if rows[0][0] == file_stem:
                status.status_type = StatusType.CLOUD_LINK_ID
            else:
                status.status_type = StatusType.CLOUD_LINK
    except mariadb.Error as error:
        log.debug("select failed with %s", error)

    # Without a timeout a stalled service would block the whole run forever.
    response = requests.get(
        f"http://127.0.0.1:8800/media/files/{file_stem}",
        timeout=10,
    )
    log.debug("Status: %s", response.status_code)
    if response.status_code == 200:
        status.status_type = StatusType.FILE_ID
        status.id = response.json()['id']
    return status
def create_item_id_mapping(log: Logger, data_list: List[dict]) -> Dict[str, dict]:
    """Index a list of records by their ``"id"`` value.

    Args:
        log: Logger; every record is written to it at debug level.
        data_list: Records to index. Each one must contain an ``"id"`` key.

    Returns:
        A dictionary mapping each id to its record (the same object, not a
        copy). When an id occurs more than once, the last record wins.

    Raises:
        KeyError: If a record has no ``"id"`` key.
    """
    item_id_mapping: Dict[str, dict] = {}
    for data_item in data_list:
        log.debug(data_item)
        item_id_mapping[data_item["id"]] = data_item
    return item_id_mapping
# Walks every entry of media_dir, asks get_status_of_file() how the entry
# relates to the database and then renames the file and/or refreshes its
# cloud_link depending on the reported status.
# NOTE(review): in this view the body is interrupted by the opening lines of
# check_duplicate_links (marked below) and block indentation is not shown, so
# the exact nesting has to be confirmed against the checked-out source.
def rename_files_to_id(media_dir, dry_run, conn, log):
media_path = Path(media_dir)
cursor = conn.cursor()
for file in media_path.iterdir():
log.debug('found file: {}'.format(file.name))
status: FileStatus = get_status_of_file(file, cursor, log)
file_id = status.id
if not file_id:
log.info(f"ID of file {file.name} is unknown")
# --- start of check_duplicate_links (its remaining body appears further down
# the file, after check_file_with_db) ---
# Fetches the "media_file" records from the server and prepares the two
# dictionaries used to detect URL paths that occur more than once.
# NOTE(review): `log=logger` below reads the module-level name `logger`, not
# this function's `log` parameter; `logger` is only assigned inside the
# __main__ block, so calling the function from an importing module would raise
# NameError. Presumably `log=log` was intended -- confirm.
def check_duplicate_links(log: Logger, server: Server):
data = server.request(log=logger, table="media_file")
mapping = create_item_id_mapping(log=log, data_list=data)
# URL path -> id of the first record seen with that path.
visited_link_path: Dict[str, str] = {}
# URL path -> ids of all records sharing that path (only paths seen twice or more).
duplicate_link_paths: Dict[str, List[str]] = {}
for item in data:
link = item["url"]
# Records with an empty url are ignored.
if len(link) == 0:
continue
# --- rename_files_to_id resumes here ---
# Target name: the record id plus the file's original suffix, same directory.
new_file_path = file.with_name(f"{file_id}{file.suffix}")
match status.status_type:
# Matched by file name: rename to the id, then store the new path.
case StatusType.FILE_NAME:
log.info(f'status of {file.name} is file_name')
rename_file(file, new_file_path, dry_run, log)
update_cloud_link(file_id, new_file_path, conn, dry_run, log)
# Already named after its id: only the stored path is refreshed.
case StatusType.FILE_ID:
log.info(f'status of {file.name} is file_id')
update_cloud_link(file_id, new_file_path, conn, dry_run, log)
# Matched through cloud_link: rename to the id, then store the new path.
case StatusType.CLOUD_LINK:
log.info(f'status of {file.name} is cloud_link')
rename_file(file, new_file_path, dry_run, log)
update_cloud_link(file_id, new_file_path, conn, dry_run, log)
# Matched through cloud_link and already named after its id: refresh the path only.
case StatusType.CLOUD_LINK_ID:
log.debug(f'status of {file.name} is cloud_link_id')
update_cloud_link(file_id, new_file_path, conn, dry_run, log)
# Duplicate and unknown files are only reported, never touched.
case StatusType.DUPLICATE:
log.info(f'status of {file.name} is duplicate')
case StatusType.UNKNOWN:
log.info(f'status of {file.name} is unknown')
def rename_file(current_file, new_file_path, dry_run, log):
    """Rename ``current_file`` to ``new_file_path``, or only log it in a dry run.

    Args:
        current_file: ``Path`` of the existing file.
        new_file_path: ``Path`` the file should be renamed to.
        dry_run: When true, the rename is logged at info level and the file
            system is left untouched.
        log: Logger for the dry-run message.

    Raises:
        OSError: If the rename itself fails (not in a dry run).
    """
    if dry_run:
        log.info("rename file %s to %s", current_file.name, new_file_path.name)
        return
    current_file.rename(new_file_path)
def update_cloud_link(file_id, file_path, conn, dry_run, log):
    """Store the absolute path of ``file_path`` as ``cloud_link`` of one record.

    Args:
        file_id: Value of the ``id`` column of the ``media_file`` row to update.
        file_path: ``Path`` whose absolute form is written to ``cloud_link``.
        conn: Open database connection; a cursor is created and the change is
            committed on it.
        dry_run: When true, the intended change is only logged at debug level
            and the database is not touched.
        log: Logger for debug output.
    """
    cloud_link = str(file_path.absolute())
    log.debug("update entry %s with %s", file_id, cloud_link)
    if dry_run:
        log.debug("UPDATE media_file: cloud_link=%s", cloud_link)
        return
    cursor = conn.cursor()
    # Bound parameters: paths may contain quotes, which would break (or be
    # abused in) a statement assembled by string formatting.
    cursor.execute(
        "UPDATE media_file SET cloud_link=? WHERE id=?",
        (cloud_link, file_id),
    )
    conn.commit()
def reset_cloud_link(conn, dry_run, log):
    """Set ``cloud_link`` to the empty string on every ``media_file`` row with an id.

    Args:
        conn: Open database connection; a cursor is created and the change is
            committed on it.
        dry_run: When true, the statement is only logged at info level and the
            database is not touched.
        log: Logger for the dry-run message.
    """
    if dry_run:
        log.info('UPDATE media_file SET cloud_link=""')
        return
    cursor = conn.cursor()
    # Single quotes for the empty string: double quotes denote an identifier
    # when the server runs with ANSI_QUOTES enabled.
    cursor.execute("UPDATE media_file SET cloud_link='' WHERE id IS NOT NULL")
    conn.commit()
def check_file_with_db(data_file: Path, m_conn, log):
    """Compare the records of a JSON dump with the rows stored in the database.

    The file must contain one JSON object mapping a table name to a list of
    records, each record being an object with at least an ``"id"`` key. For
    every record the row with the same id is selected and the outcome is
    logged at info level. When exactly one row is found, the record's values
    are logged next to the row's values, pairing them by position.

    Args:
        data_file: Path of the JSON file to read (decoded as UTF-8).
        m_conn: Open database connection; only ``cursor()`` is used.
        log: Logger receiving the report.

    Notes:
        The table name is quoted as a single identifier, so a key such as
        ``"db.table"`` is looked up as one table name rather than as a
        schema-qualified one. The positional pairing stops at the shorter of
        record and row instead of failing when the record has more keys than
        the row has columns.
    """
    log.info("read json file: %s", data_file)
    cursor = m_conn.cursor()
    with open(data_file, 'r', encoding="utf-8") as json_file:
        json_load = json.load(json_file)

    for table, items in json_load.items():
        log.info("%s: %s", table, len(items))
        # Identifiers cannot be bound as parameters; backtick-quote the table
        # name (doubling embedded backticks) and bind only the id value.
        quoted_table = "`{}`".format(str(table).replace("`", "``"))
        select_statement = f"SELECT * FROM {quoted_table} WHERE id=?"
        for item in items:
            item_id = item['id']
            cursor.execute(select_statement, (item_id,))
            rows = cursor.fetchall()
            count = len(rows)
            log.info("%s entries found for %s", count, item_id)
            if count == 0:
                log.info("entry for %s not found", item_id)
            elif count == 1:
                log.info("check entry %s", item_id)
                for value, stored in zip(item.values(), rows[0]):
                    log.info("compare %s with %s", value, stored)
# Remainder of check_duplicate_links: the rest of the per-record loop followed
# by the report. NOTE(review): the function's opening lines appear earlier in
# this view and block indentation is not shown -- confirm the nesting against
# the checked-out source.
file_id = item["id"]
# Only the path component of the URL is compared, so two links that differ
# in scheme, host or query string still count as the same link.
parsed_url = urlparse(link)
link_path = parsed_url.path
if link_path in visited_link_path:
log.info("duplicate url path found: %s", link_path)
if link_path in duplicate_link_paths:
# Third or later occurrence: extend the existing group.
duplicate_link_paths[link_path].append(file_id)
else:
# Second occurrence: open a group holding the first-seen id and this one.
duplicate_link_paths[link_path] = []
duplicate_link_paths[link_path].append(visited_link_path[link_path])
duplicate_link_paths[link_path].append(file_id)
else:
# First occurrence of this path: remember which record owns it.
visited_link_path[link_path] = file_id
# The count is the number of distinct duplicated paths, not of records.
log.info("found %s duplicate links", len(duplicate_link_paths.keys()))
deletion_list: List[str] = []
for key, value in duplicate_link_paths.items():
# A deletion candidate is only picked for groups of exactly two records.
if len(value) == 2:
log.info("%s:\n%s - %s\n%s - %s", key, value[0], mapping[value[0]]["url"], value[1], mapping[value[1]]["url"])
# If the first record's url starts with the hard-coded prefix it becomes the
# candidate; otherwise the second record does, whatever its url is.
if mapping[value[0]]["url"].startswith("https://xhamster"):
deletion_list.append(value[0])
else:
deletion_list.append(value[1])
else:
log.info("found %s links", len(value))
# The candidates are only logged here; nothing is deleted in this code.
for key in deletion_list:
log.info("%s - %s", key, mapping[key]["url"])
# Script entry point.
# NOTE(review): two generations of the start-up sequence are listed back to
# back in this view -- a database-driven one using `log` / `m_conn`, followed
# by an API-driven one using `logger` / `server`. Presumably only one of them
# is live in the real file; confirm against the checked-out source.
if __name__ == '__main__':
log = get_logger(args.verbose, args.config)
log.info("kontor.check_kontor started")
# Only the second element returned by get_database_cursors() is used.
_, m_conn = get_database_cursors(log, args.config)
# Rename the files of the media directory to their ids when --dir is given.
if args.dir:
log.info("kontor.check_kontor.rename_files_to_id")
rename_files_to_id(args.dir, args.dry_run, m_conn, log)
# Compare a JSON dump with the database when --file names an existing file.
if args.file:
data_file = Path(args.file)
if data_file.exists():
log.info("kontor.check_kontor.check_file_with_db")
check_file_with_db(data_file, m_conn, log)
# `logger` becomes a module-level name here; check_duplicate_links also reads
# it directly in its server.request(...) call.
logger = get_logger(args.verbose, args.config)
logger.info("kontor.check_kontor started")
APICONFIG = get_api_config(logger, args.config)
# The first configured server is always used.
# NOTE(review): args.server is parsed but not read in the lines shown -- confirm.
server: Server = APICONFIG.server[0]
logger.info("kontor.check_kontor.check_duplicate_links")
check_duplicate_links(logger, server)
#logger.info("kontor.check_kontor.update_cloud_link_with_found_files")
#update_cloud_link_with_found_files(data_dir, mariadb_conn, args.dry_run)
#logger.info("kontor.check_kontor.get_ids_from_column_cloud_link")
@@ -179,5 +98,4 @@ if __name__ == '__main__':
#logger.info('found {} ids in column cloud_link'.format(len(link_list)))
#logger.info("kontor.check_kontor.checking_ids_from_cloud_link")
#checking_ids_from_cloud_link(link_list, mariadb_cursor)
log.info("kontor.check_kontor finished")
logger.info("kontor.check_kontor finished")