""" download files with URLs from DB """ import logging.config import requests import re from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter from bs4 import BeautifulSoup parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter) parser.add_argument('--verbose', '-v', action='count', default=0) parser.add_argument('--all', '-a', action='store_true') args = parser.parse_args() def get_logger(level: int) -> logging.Logger: logging.config.dictConfig({ 'version': 1, 'disable_existing_loggers': False, 'formatters': { 'simple': { 'format': '[%(asctime)s] {%(filename)s:%(lineno)d} %(levelname)s - %(message)s', 'datefmt': '%Y-%m-%d %H:%M:%S', }, }, 'handlers': { 'console': { 'class': logging.StreamHandler, 'level': logging.DEBUG, 'formatter': 'simple', 'stream': 'ext://sys.stdout' }, }, 'loggers': { 'urllib3.connectionpool': { 'level': 'WARNING', 'propagate': False, }, 'root': { 'level': 'DEBUG', 'handlers': ['console'], }, }, }) logger = logging.getLogger(__file__) if level is not None: match level: case 0: logger.setLevel(logging.INFO) case 1: logger.setLevel(logging.DEBUG) case _: logger.setLevel(logging.CRITICAL) return logger def update_file(log: logging.Logger, media_file): update = requests.put(f"http://127.0.0.1:8800/api/media/files/{media_file['id']}", json=media_file) log.info(f"update status: {update.status_code}") log.info(f"update result: {update.json()}") def get_actor_links(log: logging.Logger, media_file_url: str) -> list: try: r = requests.get(media_file_url) soup = BeautifulSoup(r.content, "html.parser") error404 = soup.css.select_one('.error404-title') if error404 and error404.get_text() == "Video nicht gefunden": log.info(f"{error404.get_text()}") item['url'] = None item['review'] = False update_file(log, item) return [] anchors = soup.find_all('a', attrs={'href': re.compile("^https://.*pornstars/.*")}) actor_links = [] for anchor in anchors: link_url = anchor.get('href') if link_url.endswith('all/countries'): continue actor_links.append(link_url) log.info(f"links({len(actor_links)}): {actor_links}") return actor_links except Exception as error: log.info(f"something went wrong: {error}") return [] if __name__ == '__main__': log = get_logger(args.verbose) log.info('kontor.find_links started') log.info('get all actors') response = requests.get("http://127.0.0.1:8800/api/media/actors") data = response.json() actors = {} for item in data: actor = {} actor['id'] = item['id'] actor['name'] = item['name'] actor['url'] = item['url'] actors[item['url']] = actor log.debug(f'all actors: {actors}') files_url = "" if args.all: files_url= "http://127.0.0.1:8800/api/media/files" else: files_url = "http://127.0.0.1:8800/api/media/files?review=true" response = requests.get(files_url) log.info(f"Status: {response.status_code}") data = response.json() log.info(f"data: {len(data)}") for item in data: link = item['url'] if not link: continue if str(link) == "None": continue log.info(f"{item['id']} - {str(link)}") actor_links = get_actor_links(log, link) actor_list = [] for actor_link in actor_links: if actor_link in actors: log.info(f"found actor with id: {actors[actor_link]['id']}") actor_list.append(actors[actor_link]) actor_response = requests.put(f"http://127.0.0.1:8800/api/media/files/{item['id']}/actors", json=actor_list) actor_data = actor_response.json() log.info(f"found {len(actor_data)} actors") log.info(f"found actors: {actor_data}") item['review'] = False update = requests.put(f"http://127.0.0.1:8800/api/media/files/{item['id']}", json=item) log.info(f"update status: {update.status_code}") log.info(f"update result: {update.json()}") log.info('kontor.find_links finished')