From acbf9c51a3a82b9e3fd1a4907bfdf1eddaf0bfd7 Mon Sep 17 00:00:00 2001 From: Thomas Peetz Date: Sun, 7 Sep 2025 22:47:06 +0200 Subject: [PATCH] refactor find_links.py by adding methods for specific tasks --- kontor-scripts/find_links.py | 122 ++++++++++++++++++++++++++--------- 1 file changed, 90 insertions(+), 32 deletions(-) diff --git a/kontor-scripts/find_links.py b/kontor-scripts/find_links.py index 462a26e..77707ae 100644 --- a/kontor-scripts/find_links.py +++ b/kontor-scripts/find_links.py @@ -3,7 +3,7 @@ download files with URLs from DB """ import logging.config import sys -from typing import Any, AnyStr, Dict, List +from typing import Any import requests import re from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter @@ -14,6 +14,7 @@ parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter) parser.add_argument('--verbose', '-v', action='count', default=0) parser.add_argument('--all', '-a', action='store_true') parser.add_argument('--limit', '-l', type=int, help='maximum number of links to check') +parser.add_argument('--add-actor', action='store_true', help='add missing actors') args = parser.parse_args() @@ -64,7 +65,7 @@ def update_file(log: logging.Logger, media_file): log.debug(f"update status: {update.status_code}") log.debug(f"update result: {update.json()}") -def get_actor_links(log: logging.Logger, media_file_url: str) -> list: +def get_actor_links(log: logging.Logger, media_file_url: str) -> list[str]: try: r = requests.get(media_file_url) soup = BeautifulSoup(r.content, "html.parser") @@ -78,7 +79,7 @@ def get_actor_links(log: logging.Logger, media_file_url: str) -> list: anchors = soup.find_all('a', attrs={'href': re.compile("^https://.*pornstars/.*")}) actor_links = [] for anchor in anchors: - link_url = anchor.get('href') + link_url = str(anchor.get("href")) # type: ignore if link_url.endswith('all/countries'): continue if link_url in actor_links: @@ -101,51 +102,82 @@ def get_media_files(all_files: bool)-> Any: data = response.json() return data -def update_media_file(item, log: logging.Logger): +def update_media_file(item, log: logging.Logger) -> Any: update = requests.put(f"http://127.0.0.1:8800/api/media/files/{item['id']}", json=item) log.debug(f"update status: {update.status_code}") log.debug(f"update result: {update.json()}") return update.json() -def update_media_file_actors(mediafile: dict, actor_id_list: List[AnyStr], log: logging.Logger): +def update_media_file_actors(mediafile: dict, + actor_id_list: list[dict[str, str]], + actor_links: list[str], + map_ids_actor: dict[str, str], + log: logging.Logger): media_file_id = mediafile['id'] actor_response = requests.put(f"http://127.0.0.1:8800/api/media/files/{media_file_id}/actors", json=actor_id_list) - actor_data = actor_response.json() - persisted_actor_links: int = len(actor_data) - found_actor_links: int = len(actor_links) - if persisted_actor_links < found_actor_links: - log.warning(f"{persisted_actor_links} links persisted, but {found_actor_links} links are available") + files_actor_list = actor_response.json() + persisted_actor_links_count: int = len(files_actor_list) + found_actor_links_count: int = len(actor_links) + if persisted_actor_links_count < found_actor_links_count: + log.warning(f"{persisted_actor_links_count} links persisted, but {found_actor_links_count} links are available") mediafile['review'] = True - elif persisted_actor_links > found_actor_links: + elif persisted_actor_links_count > found_actor_links_count: log.warning("more persisted links than found actors") + for file_actor in files_actor_list: + actor_id = file_actor['actor_id'] + actor_url = map_ids_actor[actor_id]['url'] # type: ignore + log.debug(f"check if actor({actor_id}) with {actor_url} in list") + if actor_url not in actor_links: + log.info(f"actor not found in links, delete relation {file_actor['id']}") mediafile['review'] = True else: mediafile['review'] = False - log.debug(f"found {persisted_actor_links} actors") - log.debug(f"found actors: {actor_data}") + log.debug(f"found {persisted_actor_links_count} actors") + log.debug(f"found actors: {files_actor_list}") -def get_actor_ids(link_list: list, map_url_actor, map_ids_actor, map_path_actor, missing_actors: dict, log: logging.Logger) -> list: - found_actors: list = [] +def get_actor_ids(link_list: list[str], + map_url_actor: dict[str, str], + map_ids_actor: dict[str, str], + map_path_actor: dict[str, str], + missing_actors: dict[str, int], + log: logging.Logger) -> list[dict[str, str]]: + found_actors: list[dict[str, str]] = [] for link in link_list: - if link in map_url_actor: - actor_id = map_url_actor[link]['id'] - log.debug(f"found actor with id: {actor_id}") - found_actors.append(map_ids_actor[actor_id]) + actor = get_persisted_actor(link, map_url_actor, map_ids_actor, map_path_actor, log) + if actor: + found_actors.append(actor) else: - path = link.split('/')[-1] - if path in map_path_actor: - actor_id = map_path_actor[path]['id'] - log.debug(f"found actor with id: {actor_id} by path {path}") - found_actors.append(map_ids_actor[actor_id]) + if link in missing_actors: + count = missing_actors[link] + missing_actors[link] = count +1 else: - log.info(f"found actor {link} missing") - if link in missing_actors: - count = missing_actors[link] - missing_actors[link] = count +1 - else: - missing_actors.update({link: 1}) + missing_actors.update({link: 1}) return found_actors +def get_persisted_actor(actor_url: str, + map_url_actor: dict[str, str], + map_ids_actor: dict[str, str], + map_path_actor: dict[str, str], + log: logging.Logger) -> dict[str, str] | None: + alternate_url_actor: dict[str, dict[str, str]] = { + 'https://ge.xhamster2.com/pornstars/jean-yves-lecastel': + {'id': 'e354b866-717c-4a66-ad38-bc7c23d97e36', 'name': 'Jean-Yves Le Castel', 'url': 'https://ge.xhamster.com/pornstars/jean-yves-le-castel'}} # type: ignore + if actor_url in map_url_actor: + actor_id: str = map_url_actor[actor_url]['id'] # type: ignore + log.debug(f"found actor with id: {actor_id}") + return map_ids_actor[actor_id] # type: ignore + path = actor_url.split('/')[-1] + if path in map_path_actor: + actor_id: str = map_path_actor[path]['id'] # type: ignore + log.debug(f"found actor with id: {actor_id} by path {path}") + return map_ids_actor[actor_id] # type: ignore + if actor_url in alternate_url_actor: + actor_id: str = alternate_url_actor[actor_url]['id'] + log.info(f"found actor with id: {actor_id} by alternative {path}") + return alternate_url_actor[actor_url] + log.info(f"found actor {actor_url} missing") + return None + def get_actors(log: logging.Logger): actors_url = {} actors_id = {} @@ -167,6 +199,28 @@ def get_actors(log: logging.Logger): log.debug(f'all actors: {actors_path}') return (actors_url, actors_id, actors_path) +def get_actor_name(actor_url: str, log: logging.Logger) -> str | None: + try: + r = requests.get(actor_url) + soup = BeautifulSoup(r.content, "html.parser") + titles = soup.find_all('h1') + for title in titles: + log.info(f"title: {title.get_text()}") + return title.get_text() + except Exception as error: + log.warning(f"something went wrong: {error}") + return None + +def create_actor(actor_url: str, actor_name: str, log: logging.Logger): + new_actor = { 'name': actor_name, 'url': actor_url} + actor_response = requests.post(f"http://127.0.0.1:8800/api/media/actors", json=new_actor) + log.warning(f"add status: {actor_response.status_code}") + if actor_response.status_code == 201: + actor_data = actor_response.json() + log.warning(f"Actor {actor_data} persisted") + else: + log.info(f"Actor with {actor_url} not persisted") + if __name__ == '__main__': log = get_logger(args.verbose) @@ -188,9 +242,9 @@ if __name__ == '__main__': if str(link) == "None": continue log.warning(f"{media_file['id']} - {str(link)}") - actor_links = get_actor_links(log, link) + actor_links: list[str] = get_actor_links(log, link) actor_id_list = get_actor_ids(actor_links, actors_url, actors_id, actors_path, missing_actors, log) - update_media_file_actors(media_file, actor_id_list, log) + update_media_file_actors(media_file, actor_id_list, actor_links, actors_id, log) result = update_media_file(media_file, log) log.warning(f"processed {mediafile_index}/{entries_count}") if args.limit and args.limit <= mediafile_index: @@ -198,6 +252,10 @@ if __name__ == '__main__': mediafile_index += 1 for link in missing_actors: log.info(f"{link}: {missing_actors[link]}") + actor_name = get_actor_name(link, log) + if actor_name and args.add_actor: + create_actor(link, actor_name, log) + log.info("Sort missing actors by occurence count:") sorted_missing = dict(sorted(missing_actors.items(), key=lambda item: item[1])) for key in sorted_missing: log.info(f"{key} : {sorted_missing[key]}")