check for duplicate links
Gitea Actions Demo / Explore-Gitea-Actions (push) Successful in 4s

This commit is contained in:
2026-05-31 00:18:40 +02:00
parent c885f6cc02
commit 6c4ff8bcad
2 changed files with 63 additions and 145 deletions
+59 -141
View File
@@ -1,22 +1,22 @@
""" """
Checks the database kontor Checks the database kontor
""" """
from dataclasses import dataclass
from enum import Enum, auto from enum import Enum, auto
import json from logging import Logger
import mariadb from typing import Dict, List, Optional
import requests
from pathlib import Path
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from urllib.parse import urlparse
from api import Option, OptionType, Server, get_api_config, get_logger
from config import get_logger, get_database_cursors
parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter) parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
parser.add_argument('--verbose', '-v', action='count', default=0) parser.add_argument("--verbose", "-v", action="count", default=0)
parser.add_argument('--config', '-c', default='kontor') parser.add_argument("--config", "-c", default="kontor-api")
parser.add_argument('--file', '-f') parser.add_argument("--dir", "-d", default="/data/media")
parser.add_argument('--dir', '-d') parser.add_argument("--dry-run", "-m", action="store_true")
parser.add_argument('--dry-run', '-m', action='store_true') parser.add_argument("--server", "-s")
parser.add_argument('--reset-cloud-link', '-r', action='store_true')
args = parser.parse_args() args = parser.parse_args()
class StatusType(Enum): class StatusType(Enum):
@@ -36,142 +36,61 @@ class FileStatus:
self.id = response['id'] self.id = response['id']
def get_status_of_file(found_file: Path, cursor, log) -> FileStatus: def create_item_id_mapping(log: Logger, data_list: List[dict]) -> Dict[str, dict]:
status = FileStatus() """
try: create dictionary with id as key and dictionary as value.
cursor.execute(f'SELECT id, cloud_link FROM media_file WHERE file_name="{found_file.name}"') """
rows = cursor.fetchall() item_id_mapping: Dict[str, dict] = {}
if len(rows) == 1: for data_item in data_list:
status.status_type = StatusType.FILE_NAME log.debug(data_item)
status.id = rows[0][0] item_id_mapping[data_item["id"]] = data_item
except mariadb.Error as error: return item_id_mapping
log.debug(f'select failed with {error}')
try:
cursor.execute(f'SELECT id FROM media_file WHERE id="{found_file.stem}"')
rows = cursor.fetchall()
if len(rows) == 1:
status.status_type = StatusType.FILE_ID
status.id = rows[0][0]
if len(rows) > 1:
status.status_type = StatusType.DUPLICATE
for row in rows:
log.info(f"found {row[0]} with {found_file}")
except mariadb.Error as error:
log.debug(f'select failed with {error}')
try:
cursor.execute(f'SELECT id FROM media_file WHERE cloud_link LIKE "%{found_file.stem}%"')
rows = cursor.fetchall()
if len(rows) == 1:
status.id = rows[0][0]
if rows[0][0] == found_file.stem:
status.status_type = StatusType.CLOUD_LINK_ID
else:
status.status_type = StatusType.CLOUD_LINK
except mariadb.Error as error:
log.debug(f'select failed with {error}')
response = requests.get(f"http://127.0.0.1:8800/media/files/{found_file.stem}")
log.debug(f"Status: {response.status_code}")
if response.status_code == 200:
status.status_type = StatusType.FILE_ID
status.id = response.json()['id']
return status
def rename_files_to_id(media_dir, dry_run, conn, log):
media_path = Path(media_dir) def check_duplicate_links(log: Logger, server: Server):
cursor = conn.cursor() data = server.request(log=logger, table="media_file")
for file in media_path.iterdir(): mapping = create_item_id_mapping(log=log, data_list=data)
log.debug('found file: {}'.format(file.name)) visited_link_path: Dict[str, str] = {}
status: FileStatus = get_status_of_file(file, cursor, log) duplicate_link_paths: Dict[str, List[str]] = {}
file_id = status.id for item in data:
if not file_id: link = item["url"]
log.info(f"ID of file {file.name} is unknown") if len(link) == 0:
continue continue
new_file_path = file.with_name(f"{file_id}{file.suffix}") file_id = item["id"]
match status.status_type: parsed_url = urlparse(link)
case StatusType.FILE_NAME: link_path = parsed_url.path
log.info(f'status of {file.name} is file_name') if link_path in visited_link_path:
rename_file(file, new_file_path, dry_run, log) log.info("duplicate url path found: %s", link_path)
update_cloud_link(file_id, new_file_path, conn, dry_run, log) if link_path in duplicate_link_paths:
case StatusType.FILE_ID: duplicate_link_paths[link_path].append(file_id)
log.info(f'status of {file.name} is file_id')
update_cloud_link(file_id, new_file_path, conn, dry_run, log)
case StatusType.CLOUD_LINK:
log.info(f'status of {file.name} is cloud_link')
rename_file(file, new_file_path, dry_run, log)
update_cloud_link(file_id, new_file_path, conn, dry_run, log)
case StatusType.CLOUD_LINK_ID:
log.debug(f'status of {file.name} is cloud_link_id')
update_cloud_link(file_id, new_file_path, conn, dry_run, log)
case StatusType.DUPLICATE:
log.info(f'status of {file.name} is duplicate')
case StatusType.UNKNOWN:
log.info(f'status of {file.name} is unknown')
def rename_file(current_file, new_file_path, dry_run, log):
if dry_run:
log.info('rename file {} to {}'.format(current_file.name, new_file_path.name))
else: else:
current_file.rename(Path(new_file_path)) duplicate_link_paths[link_path] = []
duplicate_link_paths[link_path].append(visited_link_path[link_path])
def update_cloud_link(file_id, file_path, conn, dry_run, log): duplicate_link_paths[link_path].append(file_id)
cursor = conn.cursor()
log.debug(f'update entry {file_id} with {file_path.absolute()}')
if dry_run:
log.debug(f'UPDATE media_file: cloud_link={file_path.absolute()}')
else: else:
cursor.execute('UPDATE media_file SET cloud_link="{}" WHERE id="{}"'.format(file_path.absolute(), file_id)) visited_link_path[link_path] = file_id
conn.commit() log.info("found %s duplicate links", len(duplicate_link_paths.keys()))
deletion_list: List[str] = []
def reset_cloud_link(conn, dry_run, log): for key, value in duplicate_link_paths.items():
cursor = conn.cursor() if len(value) == 2:
if dry_run: log.info("%s:\n%s - %s\n%s - %s", key, value[0], mapping[value[0]]["url"], value[1], mapping[value[1]]["url"])
log.info('UPDATE media_file SET cloud_link=""') if mapping[value[0]]["url"].startswith("https://xhamster"):
deletion_list.append(value[0])
else: else:
cursor.execute('UPDATE media_file SET cloud_link="" WHERE id is NOT NULL') deletion_list.append(value[1])
conn.commit() else:
log.info("found %s links", len(value))
def check_file_with_db(data_file: Path, m_conn, log): for key in deletion_list:
log.info(f"read json file: {data_file}") log.info("%s - %s", key, mapping[key]["url"])
cursor = m_conn.cursor()
with open(data_file, 'r') as json_file:
json_load = json.load(json_file)
for table in json_load:
log.info(f"{table}: {len(json_load[table])}")
items = json_load[table]
for item in items:
item_id = item['id']
select_statement = f"SELECT * FROM {table} WHERE id='{item_id}'"
cursor.execute(select_statement)
rows = cursor.fetchall()
count = len(rows)
log.info(f"{count} entries found for {item_id}")
if count == 0:
log.info(f"entry for {item_id} not found")
if count == 1:
log.info(f"check entry {item_id}")
#log.info(f"entry {rows[0]}")
columns = []
values = []
for (key, value) in item.items():
columns.append(key)
values.append(value)
for index, _ in enumerate(columns):
log.info(f"compare {values[index]} with {rows[0][index]}")
if __name__ == '__main__': if __name__ == '__main__':
log = get_logger(args.verbose, args.config) logger = get_logger(args.verbose, args.config)
log.info("kontor.check_kontor started") logger.info("kontor.check_kontor started")
_, m_conn = get_database_cursors(log, args.config) APICONFIG = get_api_config(logger, args.config)
if args.dir: server: Server = APICONFIG.server[0]
log.info("kontor.check_kontor.rename_files_to_id") logger.info("kontor.check_kontor.check_duplicate_links")
rename_files_to_id(args.dir, args.dry_run, m_conn, log) check_duplicate_links(logger, server)
if args.file:
data_file = Path(args.file)
if data_file.exists():
log.info("kontor.check_kontor.check_file_with_db")
check_file_with_db(data_file, m_conn, log)
#logger.info("kontor.check_kontor.update_cloud_link_with_found_files") #logger.info("kontor.check_kontor.update_cloud_link_with_found_files")
#update_cloud_link_with_found_files(data_dir, mariadb_conn, args.dry_run) #update_cloud_link_with_found_files(data_dir, mariadb_conn, args.dry_run)
#logger.info("kontor.check_kontor.get_ids_from_column_cloud_link") #logger.info("kontor.check_kontor.get_ids_from_column_cloud_link")
@@ -179,5 +98,4 @@ if __name__ == '__main__':
#logger.info('found {} ids in column cloud_link'.format(len(link_list))) #logger.info('found {} ids in column cloud_link'.format(len(link_list)))
#logger.info("kontor.check_kontor.checking_ids_from_cloud_link") #logger.info("kontor.check_kontor.checking_ids_from_cloud_link")
#checking_ids_from_cloud_link(link_list, mariadb_cursor) #checking_ids_from_cloud_link(link_list, mariadb_cursor)
log.info("kontor.check_kontor finished") logger.info("kontor.check_kontor finished")