Files
kontor/kontor-scripts/check_kontor.py
T
tpeetz 6c4ff8bcad
Gitea Actions Demo / Explore-Gitea-Actions (push) Successful in 4s
check for duplicate links
2026-05-31 00:18:40 +02:00

102 lines
3.8 KiB
Python

"""
Checks the kontor database for media files whose links share the same URL path.
"""
from dataclasses import dataclass
from enum import Enum, auto
from logging import Logger
from typing import Dict, List, Optional
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from urllib.parse import urlparse
from api import Option, OptionType, Server, get_api_config, get_logger
# Command line interface of the script. ArgumentDefaultsHelpFormatter makes
# the --help output show each option's default value.
parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
# -v may be repeated; the number of occurrences is stored (0 when absent)
# and handed to get_logger in the main section.
parser.add_argument("--verbose", "-v", action="count", default=0)
# Configuration name, passed to get_logger and get_api_config in the main section.
parser.add_argument("--config", "-c", default="kontor-api")
# NOTE(review): args.dir is not read anywhere in the visible code --
# presumably the media directory to scan; confirm or remove.
parser.add_argument("--dir", "-d", default="/data/media")
# NOTE(review): args.dry_run only appears in commented-out code in the main
# section; note that the short flag is -m.
parser.add_argument("--dry-run", "-m", action="store_true")
# NOTE(review): args.server is not read in the visible code; the main section
# always picks APICONFIG.server[0].
parser.add_argument("--server", "-s")
# Runs at import time, so importing this module also parses sys.argv.
args = parser.parse_args()
class StatusType(Enum):
    """
    Status categories used for FileStatus.status_type.

    The numeric values are the ones auto() would assign (1, 2, 3, ...).
    """
    UNKNOWN = 1
    FILE_NAME = 2
    FILE_ID = 3
    DUPLICATE = 4
    CLOUD_LINK = 5
    CLOUD_LINK_ID = 6
# eq=False keeps identity-based equality and hashing, exactly as the plain
# class behaved; the decorator only adds a keyword constructor and a repr.
@dataclass(eq=False)
class FileStatus:
    """
    Status record for a single entry.

    Attributes:
        id: identifier taken from a response; None until get_response ran.
        status_type: current status; starts as StatusType.UNKNOWN.
    """
    id: str | None = None
    status_type: StatusType = StatusType.UNKNOWN

    def get_response(self, response: dict) -> None:
        """
        Apply a response to this record.

        Sets status_type to StatusType.FILE_NAME and stores response['id'].

        Raises:
            KeyError: if response has no 'id' key. status_type has already
                been updated at that point.
        """
        self.status_type = StatusType.FILE_NAME
        self.id = response['id']
def create_item_id_mapping(log: Logger, data_list: List[dict]) -> Dict[str, dict]:
    """
    index the given dictionaries by their "id" entry.

    Every dictionary is written to the debug log before it is indexed. When
    two dictionaries carry the same id, the later one replaces the earlier.
    """
    def _logged_pairs():
        # Log first, then read the id, so the log shows the offending entry
        # when the "id" key is missing.
        for entry in data_list:
            log.debug(entry)
            yield entry["id"], entry

    return dict(_logged_pairs())
def check_duplicate_links(
    log: Logger,
    server: Server,
    *,
    deletion_prefix: str = "https://xhamster",
) -> List[str]:
    """
    Report entries of table media_file whose urls share the same url path.

    Only the path component of each url is compared, so urls that differ in
    host or query string but have the same path count as duplicates. Entries
    with an empty (or None) url are ignored. Nothing is deleted here: the
    findings are written to the log and the deletion candidates are returned.

    For a path shared by exactly two entries one of them is chosen as
    deletion candidate: the first-seen entry if its url starts with
    deletion_prefix, otherwise the second one. Paths shared by more than two
    entries are only counted in the log.

    Args:
        log: logger that receives the report.
        server: source of the data; server.request(log=..., table="media_file")
            is expected to return a list of dicts with the keys "id" and
            "url" (assumption based on how the result is used here).
        deletion_prefix: url prefix that marks the first entry of a pair as
            the one to delete.

    Returns:
        The ids of the deletion candidates, one per path with exactly two
        entries, in the order the duplicate paths were discovered.
    """
    # Use the logger that was passed in; the module-level `logger` only
    # exists when the file is run as a script.
    data = server.request(log=log, table="media_file")
    mapping = create_item_id_mapping(log=log, data_list=data)
    first_id_by_path: Dict[str, str] = {}
    duplicate_link_paths: Dict[str, List[str]] = {}
    for item in data:
        link = item["url"]
        # `not link` skips empty strings and also a missing (None) url.
        if not link:
            continue
        file_id = item["id"]
        link_path = urlparse(link).path
        if link_path not in first_id_by_path:
            first_id_by_path[link_path] = file_id
            continue
        log.info("duplicate url path found: %s", link_path)
        # The list starts with the first-seen id, later ids are appended.
        duplicate_link_paths.setdefault(
            link_path, [first_id_by_path[link_path]]
        ).append(file_id)
    log.info("found %s duplicate links", len(duplicate_link_paths))
    deletion_list: List[str] = []
    for link_path, file_ids in duplicate_link_paths.items():
        if len(file_ids) != 2:
            # More than two entries: no automatic choice, only report the count.
            log.info("found %s links", len(file_ids))
            continue
        first_id, second_id = file_ids
        first_url = mapping[first_id]["url"]
        second_url = mapping[second_id]["url"]
        log.info("%s:\n%s - %s\n%s - %s", link_path, first_id, first_url, second_id, second_url)
        if first_url.startswith(deletion_prefix):
            deletion_list.append(first_id)
        else:
            deletion_list.append(second_id)
    for file_id in deletion_list:
        log.info("%s - %s", file_id, mapping[file_id]["url"])
    return deletion_list
if __name__ == '__main__':
    # Script entry point: set up logging, load the API configuration and run
    # the duplicate-link check. The names assigned here are module globals.
    logger = get_logger(args.verbose, args.config)
    logger.info("kontor.check_kontor started")
    APICONFIG = get_api_config(logger, args.config)
    # Always uses the first configured server.
    # NOTE(review): the --server option is parsed but not consulted here -- confirm intended.
    server: Server = APICONFIG.server[0]
    logger.info("kontor.check_kontor.check_duplicate_links")
    check_duplicate_links(logger, server)
    # NOTE(review): the steps below are disabled. They reference functions and
    # names (data_dir, mariadb_conn, mariadb_cursor, link_list) that are not
    # defined in the visible code -- presumably leftovers from an earlier
    # version; confirm before re-enabling or remove them.
    #logger.info("kontor.check_kontor.update_cloud_link_with_found_files")
    #update_cloud_link_with_found_files(data_dir, mariadb_conn, args.dry_run)
    #logger.info("kontor.check_kontor.get_ids_from_column_cloud_link")
    #get_ids_from_column_cloud_link(link_list, mariadb_cursor)
    #logger.info('found {} ids in column cloud_link'.format(len(link_list)))
    #logger.info("kontor.check_kontor.checking_ids_from_cloud_link")
    #checking_ids_from_cloud_link(link_list, mariadb_cursor)
    logger.info("kontor.check_kontor finished")