refactor find_links.py by adding methods for specific tasks

This commit is contained in:
Thomas Peetz
2025-09-07 22:47:06 +02:00
parent 5bfea51b27
commit acbf9c51a3
+90 -32
View File
@@ -3,7 +3,7 @@ download files with URLs from DB
"""
import logging.config
import sys
from typing import Any, AnyStr, Dict, List
from typing import Any
import requests
import re
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
@@ -14,6 +14,7 @@ parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
parser.add_argument('--verbose', '-v', action='count', default=0)
parser.add_argument('--all', '-a', action='store_true')
parser.add_argument('--limit', '-l', type=int, help='maximum number of links to check')
parser.add_argument('--add-actor', action='store_true', help='add missing actors')
args = parser.parse_args()
@@ -64,7 +65,7 @@ def update_file(log: logging.Logger, media_file):
log.debug(f"update status: {update.status_code}")
log.debug(f"update result: {update.json()}")
def get_actor_links(log: logging.Logger, media_file_url: str) -> list:
def get_actor_links(log: logging.Logger, media_file_url: str) -> list[str]:
try:
r = requests.get(media_file_url)
soup = BeautifulSoup(r.content, "html.parser")
@@ -78,7 +79,7 @@ def get_actor_links(log: logging.Logger, media_file_url: str) -> list:
anchors = soup.find_all('a', attrs={'href': re.compile("^https://.*pornstars/.*")})
actor_links = []
for anchor in anchors:
link_url = anchor.get('href')
link_url = str(anchor.get("href")) # type: ignore
if link_url.endswith('all/countries'):
continue
if link_url in actor_links:
@@ -101,51 +102,82 @@ def get_media_files(all_files: bool)-> Any:
data = response.json()
return data
def update_media_file(item, log: logging.Logger) -> Any:
    """Send the media file record back to the local media API.

    Issues a PUT for ``item`` (addressed by ``item['id']``) and returns the
    decoded JSON body of the response.
    """
    update = requests.put(f"http://127.0.0.1:8800/api/media/files/{item['id']}", json=item)
    log.debug(f"update status: {update.status_code}")
    # Decode the body once and reuse it for both the log line and the return value.
    result = update.json()
    log.debug(f"update result: {result}")
    return result
def update_media_file_actors(mediafile: dict,
                             actor_id_list: list[dict[str, str]],
                             actor_links: list[str],
                             map_ids_actor: dict[str, dict[str, str]],
                             log: logging.Logger):
    """Store the actors of a media file and flag the file for review if needed.

    PUTs ``actor_id_list`` to the actors endpoint of the media file and
    compares the number of relations in the response with the number of
    actor links found on the page. ``mediafile['review']`` is set to True
    when the two counts differ and to False when they match.

    ``map_ids_actor`` maps an actor id to its actor record; only the
    record's ``'url'`` entry is read here.
    """
    media_file_id = mediafile['id']
    actor_response = requests.put(
        f"http://127.0.0.1:8800/api/media/files/{media_file_id}/actors",
        json=actor_id_list,
    )
    files_actor_list = actor_response.json()
    persisted_actor_links_count: int = len(files_actor_list)
    found_actor_links_count: int = len(actor_links)

    if persisted_actor_links_count < found_actor_links_count:
        log.warning(f"{persisted_actor_links_count} links persisted, but {found_actor_links_count} links are available")
        mediafile['review'] = True
    elif persisted_actor_links_count > found_actor_links_count:
        log.warning("more persisted links than found actors")
        for file_actor in files_actor_list:
            actor_id = file_actor['actor_id']
            actor = map_ids_actor.get(actor_id)
            if actor is None:
                # An id missing from the lookup table must not abort the whole run.
                log.warning(f"actor({actor_id}) of relation {file_actor['id']} is unknown")
                continue
            actor_url = actor['url']
            log.debug(f"check if actor({actor_id}) with {actor_url} in list")
            if actor_url not in actor_links:
                # NOTE: the stale relation is only reported here; nothing is deleted.
                log.info(f"actor not found in links, delete relation {file_actor['id']}")
        mediafile['review'] = True
    else:
        mediafile['review'] = False

    log.debug(f"found {persisted_actor_links_count} actors")
    log.debug(f"found actors: {files_actor_list}")
def get_actor_ids(link_list: list[str],
                  map_url_actor: dict[str, dict[str, str]],
                  map_ids_actor: dict[str, dict[str, str]],
                  map_path_actor: dict[str, dict[str, str]],
                  missing_actors: dict[str, int],
                  log: logging.Logger) -> list[dict[str, str]]:
    """Resolve actor links to actor records and count the unresolved ones.

    Each link is looked up with ``get_persisted_actor``. Resolved records are
    collected in link order and returned. For every link that cannot be
    resolved, its counter in ``missing_actors`` is incremented in place
    (starting at 1 on first occurrence).
    """
    found_actors: list[dict[str, str]] = []
    for link in link_list:
        actor = get_persisted_actor(link, map_url_actor, map_ids_actor, map_path_actor, log)
        if actor:
            found_actors.append(actor)
        else:
            missing_actors[link] = missing_actors.get(link, 0) + 1
    return found_actors
def get_persisted_actor(actor_url: str,
map_url_actor: dict[str, str],
map_ids_actor: dict[str, str],
map_path_actor: dict[str, str],
log: logging.Logger) -> dict[str, str] | None:
alternate_url_actor: dict[str, dict[str, str]] = {
'https://ge.xhamster2.com/pornstars/jean-yves-lecastel':
{'id': 'e354b866-717c-4a66-ad38-bc7c23d97e36', 'name': 'Jean-Yves Le Castel', 'url': 'https://ge.xhamster.com/pornstars/jean-yves-le-castel'}} # type: ignore
if actor_url in map_url_actor:
actor_id: str = map_url_actor[actor_url]['id'] # type: ignore
log.debug(f"found actor with id: {actor_id}")
return map_ids_actor[actor_id] # type: ignore
path = actor_url.split('/')[-1]
if path in map_path_actor:
actor_id: str = map_path_actor[path]['id'] # type: ignore
log.debug(f"found actor with id: {actor_id} by path {path}")
return map_ids_actor[actor_id] # type: ignore
if actor_url in alternate_url_actor:
actor_id: str = alternate_url_actor[actor_url]['id']
log.info(f"found actor with id: {actor_id} by alternative {path}")
return alternate_url_actor[actor_url]
log.info(f"found actor {actor_url} missing")
return None
def get_actors(log: logging.Logger):
actors_url = {}
actors_id = {}
@@ -167,6 +199,28 @@ def get_actors(log: logging.Logger):
log.debug(f'all actors: {actors_path}')
return (actors_url, actors_id, actors_path)
def get_actor_name(actor_url: str, log: logging.Logger) -> str | None:
    """Fetch ``actor_url`` and return the text of the first ``<h1>`` on the page.

    Returns None when the request fails, the server answers with an HTTP
    error status, the page has no ``<h1>``, or anything else goes wrong
    while fetching or parsing (the error is logged as a warning).
    """
    try:
        r = requests.get(actor_url)
        # Do not mistake the heading of a 4xx/5xx error page for an actor name.
        r.raise_for_status()
        soup = BeautifulSoup(r.content, "html.parser")
        title = soup.find('h1')
        if title is None:
            return None
        name = title.get_text()
        log.info(f"title: {name}")
        return name
    except Exception as error:
        # Kept broad as in the original: a failure is logged, not raised.
        log.warning(f"something went wrong: {error}")
    return None
def create_actor(actor_url: str, actor_name: str, log: logging.Logger):
    """POST a new actor (name and URL) to the local media API and log the outcome.

    A 201 response is reported together with the decoded response body; any
    other status is reported as "not persisted". Nothing is returned.
    """
    payload = {'name': actor_name, 'url': actor_url}
    response = requests.post("http://127.0.0.1:8800/api/media/actors", json=payload)
    log.warning(f"add status: {response.status_code}")

    if response.status_code != 201:
        log.info(f"Actor with {actor_url} not persisted")
        return

    created = response.json()
    log.warning(f"Actor {created} persisted")
if __name__ == '__main__':
log = get_logger(args.verbose)
@@ -188,9 +242,9 @@ if __name__ == '__main__':
if str(link) == "None":
continue
log.warning(f"{media_file['id']} - {str(link)}")
actor_links = get_actor_links(log, link)
actor_links: list[str] = get_actor_links(log, link)
actor_id_list = get_actor_ids(actor_links, actors_url, actors_id, actors_path, missing_actors, log)
update_media_file_actors(media_file, actor_id_list, log)
update_media_file_actors(media_file, actor_id_list, actor_links, actors_id, log)
result = update_media_file(media_file, log)
log.warning(f"processed {mediafile_index}/{entries_count}")
if args.limit and args.limit <= mediafile_index:
@@ -198,6 +252,10 @@ if __name__ == '__main__':
mediafile_index += 1
for link in missing_actors:
log.info(f"{link}: {missing_actors[link]}")
actor_name = get_actor_name(link, log)
if actor_name and args.add_actor:
create_actor(link, actor_name, log)
log.info("Sort missing actors by occurence count:")
sorted_missing = dict(sorted(missing_actors.items(), key=lambda item: item[1]))
for key in sorted_missing:
log.info(f"{key} : {sorted_missing[key]}")