102 lines
3.8 KiB
Python
102 lines
3.8 KiB
Python
"""
|
|
Checks the database kontor
|
|
"""
|
|
from dataclasses import dataclass
|
|
from enum import Enum, auto
|
|
from logging import Logger
|
|
from typing import Dict, List, Optional
|
|
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
|
|
from urllib.parse import urlparse
|
|
|
|
from api import Option, OptionType, Server, get_api_config, get_logger
|
|
|
|
|
|
# Command-line interface of the script. ArgumentDefaultsHelpFormatter makes
# --help print the default value next to each option that has help text.
parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)

# Repeatable flag: every occurrence increments the counter (0 when absent).
parser.add_argument(
    "--verbose",
    "-v",
    action="count",
    default=0,
)

# Configuration name; the entry point passes it to get_logger() and
# get_api_config().
parser.add_argument(
    "--config",
    "-c",
    default="kontor-api",
)

# NOTE(review): --dir, --dry-run and --server are parsed, but no active code
# visible in this file reads them — confirm whether they are still needed.
parser.add_argument(
    "--dir",
    "-d",
    default="/data/media",
)
parser.add_argument(
    "--dry-run",
    "-m",
    action="store_true",
)
parser.add_argument(
    "--server",
    "-s",
)

# Parsed at module level, so `args` is a module global from import time on.
args = parser.parse_args()
|
|
|
|
class StatusType(Enum):
    """
    Status categories stored in ``FileStatus.status_type``.

    The values are spelled out explicitly; they are the same integers that
    ``auto()`` assigns to members in declaration order (1, 2, 3, ...).
    """

    UNKNOWN = 1  # default status of a fresh FileStatus
    FILE_NAME = 2  # set by FileStatus.get_response()
    FILE_ID = 3
    DUPLICATE = 4
    CLOUD_LINK = 5
    CLOUD_LINK_ID = 6
|
|
|
|
class FileStatus:
    """
    Holds an id together with a ``StatusType`` classification.

    Both attributes are declared at class level, so a fresh instance reports
    ``id is None`` and ``StatusType.UNKNOWN`` until ``get_response`` has been
    called on it.
    """

    # NOTE(review): the file imports `dataclass`, but this class is not
    # decorated with it — confirm whether that is intentional.

    # Id copied from a response; None until get_response() has succeeded.
    id: str | None = None
    # Status classification; UNKNOWN until get_response() has succeeded.
    status_type: StatusType = StatusType.UNKNOWN

    def get_response(self, response: dict) -> None:
        """
        Apply ``response`` to this status object.

        Stores ``response['id']`` in ``self.id`` and sets ``self.status_type``
        to ``StatusType.FILE_NAME``.

        :param response: mapping that provides an ``'id'`` entry
        :raises KeyError: if ``response`` has no ``'id'`` entry; the instance
            is left unchanged in that case
        """
        # Read the id before touching any attribute, so a missing key cannot
        # leave the object marked FILE_NAME while it still has no id.
        response_id = response['id']
        self.status_type = StatusType.FILE_NAME
        self.id = response_id
|
|
|
|
|
|
def create_item_id_mapping(log: Logger, data_list: List[dict]) -> Dict[str, dict]:
    """
    Index the given records by their "id" entry.

    Each record is written to ``log`` at debug level right before it is
    indexed. If several records carry the same id, the last one wins; the key
    order follows the first appearance of each id.

    :param log: logger that receives one debug message per record
    :param data_list: records, each of which must provide an "id" key
    :return: dictionary mapping every id to its record (the same objects that
        were passed in, not copies)
    """

    def _logged_pairs():
        # One record at a time: log it, then read its id.
        for record in data_list:
            log.debug(record)
            yield record["id"], record

    return dict(_logged_pairs())
|
|
|
|
|
|
def check_duplicate_links(log: Logger, server: Server):
    """
    Log "media_file" records whose URLs share the same path.

    All records of the "media_file" table are requested from ``server``.
    Records with an empty (or missing) "url" value are skipped; the rest are
    compared by the path component of their URL, so two links that differ only
    in scheme, host or query string count as duplicates.

    For a path shared by exactly two records, one of them is picked as a
    deletion candidate: the first record if its URL starts with the hard-coded
    prefix below, otherwise the second. Paths shared by more than two records
    are only counted. Nothing is deleted here; the candidates are only logged.

    :param log: logger used for the request, the id mapping and all reports
    :param server: server object whose ``request`` method returns the records
    """
    # Use the logger that was passed in. The module-level `logger` only exists
    # when this file runs as a script, so relying on it raised NameError for
    # any caller that imported this function.
    data = server.request(log=log, table="media_file")
    mapping = create_item_id_mapping(log=log, data_list=data)

    # Path -> id of the first record seen with that path.
    first_id_by_path: Dict[str, str] = {}
    # Path -> ids of every record sharing it (first record included); only
    # paths seen at least twice get an entry.
    duplicate_link_paths: Dict[str, List[str]] = {}
    for item in data:
        link = item["url"]
        # Truthiness check also skips a None url instead of failing on len().
        if not link:
            continue
        file_id = item["id"]
        link_path = urlparse(link).path
        if link_path not in first_id_by_path:
            first_id_by_path[link_path] = file_id
            continue
        log.info("duplicate url path found: %s", link_path)
        # The first duplicate creates the list seeded with the original id.
        duplicate_link_paths.setdefault(
            link_path, [first_id_by_path[link_path]]
        ).append(file_id)
    log.info("found %s duplicate links", len(duplicate_link_paths))

    deletion_list: List[str] = []
    for link_path, file_ids in duplicate_link_paths.items():
        if len(file_ids) != 2:
            # Three or more records share the path: report the count only.
            log.info("found %s links", len(file_ids))
            continue
        first_id, second_id = file_ids
        first_url = mapping[first_id]["url"]
        second_url = mapping[second_id]["url"]
        log.info(
            "%s:\n%s - %s\n%s - %s",
            link_path, first_id, first_url, second_id, second_url,
        )
        if first_url.startswith("https://xhamster"):
            deletion_list.append(first_id)
        else:
            deletion_list.append(second_id)

    for file_id in deletion_list:
        log.info("%s - %s", file_id, mapping[file_id]["url"])
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: set up logging and the API configuration, then run
    # the duplicate-link check against the first configured server.
    logger = get_logger(args.verbose, args.config)
    logger.info("kontor.check_kontor started")

    APICONFIG = get_api_config(logger, args.config)
    # NOTE(review): always takes the first server entry; the --server option
    # is not consulted here — confirm that this is intended.
    server: Server = APICONFIG.server[0]

    logger.info("kontor.check_kontor.check_duplicate_links")
    check_duplicate_links(logger, server)

    # Disabled follow-up steps, kept as found. They refer to names that are
    # not defined in the visible part of this file.
    #logger.info("kontor.check_kontor.update_cloud_link_with_found_files")
    #update_cloud_link_with_found_files(data_dir, mariadb_conn, args.dry_run)
    #logger.info("kontor.check_kontor.get_ids_from_column_cloud_link")
    #get_ids_from_column_cloud_link(link_list, mariadb_cursor)
    #logger.info('found {} ids in column cloud_link'.format(len(link_list)))
    #logger.info("kontor.check_kontor.checking_ids_from_cloud_link")
    #checking_ids_from_cloud_link(link_list, mariadb_cursor)

    logger.info("kontor.check_kontor finished")
|