""" download files with URLs from DB """ import logging.config import os import re from typing import Any, Dict, List import requests from sqlalchemy import create_engine from sqlalchemy.orm import sessionmaker, Session from db.models.base import Base import yaml from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter from pathlib import Path from bs4 import BeautifulSoup from platformdirs import PlatformDirs from config import get_api_config from db.models.media import MediaFile parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter) parser.add_argument('--verbose', '-v', action='count', default=0) parser.add_argument('--config', '-c', default='kontor-docker') parser.add_argument('--dry-run', '-m', help='excute script without storing', action="store_true") args = parser.parse_args() DB_USER: str = os.getenv("DB_USER", "kontor") DB_PASSWORD: str = os.getenv("DB_PASSWORD", "kontor") DB_SERVER: str = os.getenv("DB_SERVER", "127.0.0.1") DB_PORT: int = int(os.getenv("DB_PORT", 5432)) DB_DBNAME: str = os.getenv("DB_DBNAME", "kontor") DATABASE_URL: str = f"postgresql://{DB_USER}:{DB_PASSWORD}@{DB_SERVER}:{DB_PORT}/{DB_DBNAME}" def get_logger(level: int, config: str): dirs = PlatformDirs(config) logging_config = Path(dirs.user_config_dir, 'logging-config.yaml') with open(logging_config, 'rt') as f: configDict = yaml.safe_load(f.read()) logging.config.dictConfig(configDict) logger = logging.getLogger('development') if level is not None: match level: case 0: logger.setLevel(logging.INFO) case 1: logger.setLevel(logging.DEBUG) case _: logger.setLevel(logging.CRITICAL) return logger def get_session() -> Session: engine = create_engine(DATABASE_URL) Base.metadata.create_all(bind=engine, checkfirst=True) SessionLocal = sessionmaker(bind=engine) return SessionLocal() def get_media_files(all_files: bool, log: logging.Logger, api_data: Dict[str, Any])-> Any: files_url = "" host = api_data["host"] port = api_data["port"] token = api_data['token'] headers: Dict[str, str] = {"Authorization": f"Bearer {token}"} if all_files: files_url= f"http://{host}:{port}/api/media/files" else: files_url = f"http://{host}:{port}/api/media/files?review=true" response = requests.get(files_url, headers=headers) log.debug(f"Status: {response.status_code}") data = response.json() return data def update_media_file(media_file: MediaFile, log: logging.Logger, api_data: Dict[str, Any]) -> Any: host = api_data["host"] port = api_data["port"] token = api_data['token'] url: str = f"http://{host}:{port}/api/media/files/{media_file.id}" headers: Dict[str, str] = {"Authorization": f"Bearer {token}"} item: Dict[str, Any] = {} item['id'] = media_file.id item['title'] = media_file.title item['file_name'] = media_file.file_name item['cloud_link'] = media_file.cloud_link item['url'] = media_file.url item['review'] = media_file.review item['should_download'] = media_file.should_download update = requests.put(url, headers=headers, json=item) log.debug(f"update status: {update.status_code}") log.debug(f"update result: {update.json()}") return update.json() def get_meta_info(media_file: MediaFile, log) -> List[str]: actor_links: List[str] = [] try: r = requests.get(media_file.url) soup = BeautifulSoup(r.content, "html.parser") error404 = soup.css.select_one('.error404-title') if error404 and error404.get_text() == "Video nicht gefunden": log.warning(f"{error404.get_text()}") media_file.url = None media_file.review = False return actor_links title_tag = soup.find('title') if title_tag: media_file.title = title_tag.get_text() media_file.review = False anchors = soup.find_all('a', attrs={'href': re.compile("^https://.*pornstars/.*")}) for anchor in anchors: link_url = str(anchor.get("href")) # type: ignore if link_url.endswith('all/countries'): continue if link_url in actor_links: continue actor_links.append(link_url) except Exception as error: log.info(f"something went wrong: {error}") media_file.title = None media_file.review = True log.info(f"update MediaFile with MetaInfos to {repr(media_file)}") log.info(f"links({len(actor_links)}): {actor_links}") return actor_links if __name__ == '__main__': log = get_logger(args.verbose, args.config) log.info('kontor.update_titles started') api_data = get_api_config(log, args.config) data = get_media_files(False, log, api_data=api_data) log.info(f"data: {len(data)}") for item in data: link = item['url'] log.info(f"{item['id']} - {str(link)}") try: r = requests.get(link) soup = BeautifulSoup(r.content, "html.parser") title = soup.title.string item['title'] = title item['review'] = False except Exception as error: log.info(f"something went wrong: {error}") item['title'] = None item['review'] = True update = requests.put(f"http://127.0.0.1:8800/api/media/files/{item['id']}", json=item) log.info(f"update status: {update.status_code}") log.info(f"update result: {update.json()}") log.info('kontor.update_titles finished')