From a6a03e3f047492699ad7de2f69078b824de51606 Mon Sep 17 00:00:00 2001 From: Thomas Peetz Date: Tue, 19 Aug 2025 08:55:45 +0200 Subject: [PATCH] fix problem in download.py when title has not been set --- kontor-api/src/apis/base.py | 5 +- kontor-api/src/apis/version1/mediaactor.py | 21 +++ .../apis/version1/{media.py => mediafile.py} | 47 +++++- kontor-api/src/db/models/base.py | 6 +- kontor-api/src/db/models/media.py | 7 +- kontor-api/src/db/repository/media.py | 15 +- kontor-api/src/schema/media/actor.py | 10 ++ kontor-api/src/schema/media/actorfile.py | 10 ++ kontor-api/src/schema/media/file.py | 4 +- kontor-scripts/db/models/base.py | 6 +- kontor-scripts/find_links.py | 134 +++++++++++++----- kontor-scripts/update_title.py | 5 + .../thpeetz/kontor/media/data/MediaFile.java | 1 - 13 files changed, 221 insertions(+), 50 deletions(-) create mode 100644 kontor-api/src/apis/version1/mediaactor.py rename kontor-api/src/apis/version1/{media.py => mediafile.py} (54%) create mode 100644 kontor-api/src/schema/media/actor.py create mode 100644 kontor-api/src/schema/media/actorfile.py diff --git a/kontor-api/src/apis/base.py b/kontor-api/src/apis/base.py index c632977..9141715 100644 --- a/kontor-api/src/apis/base.py +++ b/kontor-api/src/apis/base.py @@ -1,9 +1,10 @@ from fastapi import APIRouter -from src.apis.version1 import comic, media, tysc, admin +from src.apis.version1 import comic, mediaactor, mediafile, tysc, admin api_router = APIRouter(prefix="/api") api_router.include_router(comic.router, prefix="/comics", tags=["comics"]) -api_router.include_router(media.router, prefix="/media", tags=["media"]) +api_router.include_router(mediafile.router, prefix="/media", tags=["media"]) +api_router.include_router(mediaactor.router, prefix="/media", tags=["media"]) api_router.include_router(tysc.router, prefix="/tysc", tags=["tysc"]) api_router.include_router(admin.router, prefix="/login", tags=["login"]) diff --git a/kontor-api/src/apis/version1/mediaactor.py b/kontor-api/src/apis/version1/mediaactor.py new file mode 100644 index 0000000..facf69c --- /dev/null +++ b/kontor-api/src/apis/version1/mediaactor.py @@ -0,0 +1,21 @@ +from typing import List, AnyStr + +from fastapi import APIRouter, status, HTTPException, Depends +from sqlalchemy import select, Sequence +from src.core.log_conf import logger +from src.apis.utils import SessionDep +from src.db.repository.media import create_new_mediafile +from src.schema.media.actor import MediaActorResponse +from src.db.models.media import MediaActor + +router = APIRouter() + +@router.get("/actors", response_model=List[MediaActorResponse]) +#def get_all_files(db: SessionDep, review: bool = False, download: bool = False, current_user: Profile = Depends(get_current_user_from_token)) -> List[MediaFileResponse]: +def get_all_files(db: SessionDep, review: bool = False, download: bool = False) -> List[MediaActorResponse]: + results: List[MediaActorResponse] = [] + actors = db.scalars(select(MediaActor)).all() + for mediaactor in actors: + response = MediaActorResponse(id=mediaactor.id, name=str(mediaactor.name), url=str(mediaactor.url)) + results.append(response) + return results diff --git a/kontor-api/src/apis/version1/media.py b/kontor-api/src/apis/version1/mediafile.py similarity index 54% rename from kontor-api/src/apis/version1/media.py rename to kontor-api/src/apis/version1/mediafile.py index 1014b1e..3a672bc 100644 --- a/kontor-api/src/apis/version1/media.py +++ b/kontor-api/src/apis/version1/mediafile.py @@ -4,7 +4,9 @@ from fastapi import APIRouter, status, HTTPException, Depends from sqlalchemy import select, Sequence from src.core.log_conf import logger from src.apis.utils import SessionDep -from src.db.repository.media import create_new_mediafile +from src.db.repository.media import create_new_mediaactorfile, create_new_mediafile +from src.schema.media.actor import MediaActorResponse +from src.schema.media.actorfile import MediaActorFileResponse from src.schema.media.file import MediaFileResponse, Link, get_file_details, set_file from src.db.models.media import MediaFile @@ -47,6 +49,43 @@ def get_file(file_id: AnyStr, db: SessionDep) -> MediaFileResponse: response = get_file_details(mediafile) return response +@router.get("/files/{file_id}/actors", response_model=List[MediaActorResponse]) +def get_file_actors(file_id: AnyStr, db: SessionDep) -> List[MediaActorResponse]: + mediafile = db.get(MediaFile, file_id) + if not mediafile: + raise HTTPException(status_code=404, detail="MediaFile could not be found") + actor_files = mediafile.media_actor_files + logger.info(f"already known actors: {actor_files}") + results: List[MediaActorResponse] = [] + for actor_file in actor_files: + response = MediaActorResponse(id=actor_file.media_actor.id, name=actor_file.media_actor.name, url=actor_file.media_actor.url) + results.append(response) + return results + +@router.put("/files/{file_id}/actors", response_model=List[MediaActorFileResponse]) +def update_file_actors(file_id: AnyStr, db: SessionDep, actors: List[MediaActorResponse]) -> List[MediaActorFileResponse]: + mediafile = db.get(MediaFile, file_id) + if not mediafile: + raise HTTPException(status_code=404, detail="MediaFile could not be found") + actor_files = mediafile.media_actor_files + logger.info(f"already known actors: {actor_files}") + for actor in actors: + already_associated = False + for actor_file in actor_files: + if actor.id == actor_file.media_actor_id: + logger.info("alreay associated - do nothing") + already_associated = True + break + if not already_associated: + create_new_mediaactorfile(db, actor.id, mediafile.id) + db.refresh(mediafile) + actor_files = mediafile.media_actor_files + results: List[MediaActorFileResponse] = [] + for actor_file in actor_files: + response = MediaActorFileResponse(id=actor_file.id, actor_id=actor_file.media_actor_id, file_id=actor_file.media_file_id) + results.append(response) + return results + @router.put("/files/{file_id}", response_model=MediaFileResponse) def update_file(file_id: AnyStr, db: SessionDep, info: MediaFileResponse) -> MediaFileResponse: mediaFile = db.get(MediaFile, file_id) @@ -55,7 +94,11 @@ def update_file(file_id: AnyStr, db: SessionDep, info: MediaFileResponse) -> Med set_file(info, mediaFile) db.add(mediaFile) db.commit() - return info + mediafile = db.get(MediaFile, file_id) + if not mediafile: + raise HTTPException(status_code=404, detail="MediaFile could not be updated") + response = get_file_details(mediafile) + return response @router.post("/files", status_code=status.HTTP_201_CREATED) diff --git a/kontor-api/src/db/models/base.py b/kontor-api/src/db/models/base.py index 74b3ece..79755ff 100644 --- a/kontor-api/src/db/models/base.py +++ b/kontor-api/src/db/models/base.py @@ -21,10 +21,10 @@ class BaseMixin: class BaseVideoMixin: - cloud_link = Column(String) - file_name = Column(String) + cloud_link = Column(String, nullable=True) + file_name = Column(String, nullable=True) path = Column(String) review = Column(Boolean) title = Column(String) - url = Column(String, unique=True) + url = Column(String, nullable=True) should_download = Column(Boolean) diff --git a/kontor-api/src/db/models/media.py b/kontor-api/src/db/models/media.py index ae360b3..fe33264 100644 --- a/kontor-api/src/db/models/media.py +++ b/kontor-api/src/db/models/media.py @@ -71,7 +71,7 @@ class MediaFile(Base, BaseMixin, BaseVideoMixin): class MediaActor(Base, BaseMixin): __tablename__ = 'media_actor' name = Column(String) - url = Column(String, unique=True) + url = Column(String, unique=True, nullable=True) media_actor_files = relationship("MediaActorFile") @@ -82,6 +82,11 @@ class MediaActorFile(Base, BaseMixin): media_file_id = Column(String, ForeignKey("media_file.id"), nullable=True) media_file = relationship("MediaFile", back_populates="media_actor_files") + def __repr__(self): + return f'MediaActorFile({self.id} {self.media_actor_id} {self.media_file_id})' + + def __str__(self) -> str: + return f'{self.id} {self.media_actor_id} {self.media_file_id}' class MediaArticle(Base, BaseMixin): __tablename__ = 'media_article' diff --git a/kontor-api/src/db/repository/media.py b/kontor-api/src/db/repository/media.py index ed665ae..61cc167 100644 --- a/kontor-api/src/db/repository/media.py +++ b/kontor-api/src/db/repository/media.py @@ -3,7 +3,7 @@ from typing import AnyStr import uuid from datetime import datetime from src.core.log_conf import logger -from src.db.models.media import MediaFile, MediaVideo +from src.db.models.media import MediaActorFile, MediaFile, MediaVideo from src.webapps.media.forms import AddLinkForm @@ -38,3 +38,16 @@ def create_new_mediafile(link: AnyStr, db: Session) -> MediaFile: logger.info(f"created {media_file}") return media_file +def create_new_mediaactorfile(db: Session, actor_id: AnyStr, file_id: AnyStr) -> MediaActorFile: + logger.info(f"create MediaActorFile with actor {actor_id} and file {file_id}") + media_actor_file: MediaActorFile = MediaActorFile() + media_actor_file.id = str(uuid.uuid4()) + media_actor_file.created_date = datetime.now() + media_actor_file.last_modified_date = datetime.now() + media_actor_file.version = 0 + media_actor_file.media_actor_id = actor_id + media_actor_file.media_file_id = file_id + db.add(media_actor_file) + db.commit() + db.refresh(media_actor_file) + return media_actor_file diff --git a/kontor-api/src/schema/media/actor.py b/kontor-api/src/schema/media/actor.py new file mode 100644 index 0000000..c800964 --- /dev/null +++ b/kontor-api/src/schema/media/actor.py @@ -0,0 +1,10 @@ +from datetime import datetime + +from src.db.models.media import MediaActor +from pydantic import BaseModel + + +class MediaActorResponse(BaseModel): + id: str + name: str + url: str diff --git a/kontor-api/src/schema/media/actorfile.py b/kontor-api/src/schema/media/actorfile.py new file mode 100644 index 0000000..6bdfd2d --- /dev/null +++ b/kontor-api/src/schema/media/actorfile.py @@ -0,0 +1,10 @@ +from datetime import datetime + +from src.db.models.media import MediaFile +from pydantic import BaseModel + + +class MediaActorFileResponse(BaseModel): + id: str + file_id: str + actor_id: str diff --git a/kontor-api/src/schema/media/file.py b/kontor-api/src/schema/media/file.py index ba4afb5..3cccccc 100644 --- a/kontor-api/src/schema/media/file.py +++ b/kontor-api/src/schema/media/file.py @@ -9,14 +9,14 @@ class MediaFileResponse(BaseModel): title: str | None = None file_name: str | None = None cloud_link: str | None = None - url: str + url: str | None = None review: bool = False should_download: bool = False class Link(BaseModel): url: str -def get_file_details(mediafile: MediaFile) -> MediaFileResponse | None: +def get_file_details(mediafile: MediaFile) -> MediaFileResponse: response = MediaFileResponse(id=mediafile.id, title=mediafile.title, file_name=mediafile.file_name, diff --git a/kontor-scripts/db/models/base.py b/kontor-scripts/db/models/base.py index 84d00ef..5c628e7 100644 --- a/kontor-scripts/db/models/base.py +++ b/kontor-scripts/db/models/base.py @@ -21,10 +21,10 @@ class BaseMixin: class BaseVideoMixin: - cloud_link = Column(String) - file_name = Column(String) + cloud_link = Column(String, nullable=True) + file_name = Column(String, nullable=True) path = Column(String) review = Column(Boolean) title = Column(String) - url = Column(String, unique=True) + url = Column(String, nullable=True) should_download = Column(Boolean) diff --git a/kontor-scripts/find_links.py b/kontor-scripts/find_links.py index 70d667a..c3a83e1 100644 --- a/kontor-scripts/find_links.py +++ b/kontor-scripts/find_links.py @@ -3,25 +3,46 @@ download files with URLs from DB """ import logging.config import requests -import yaml +import re from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter -from pathlib import Path from bs4 import BeautifulSoup -from platformdirs import PlatformDirs parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter) parser.add_argument('--verbose', '-v', action='count', default=0) -parser.add_argument('--config', '-c', default='kontor-docker') +parser.add_argument('--all', '-a', action='store_true') args = parser.parse_args() -def get_logger(level: int, config: str): - dirs = PlatformDirs(config) - logging_config = Path(dirs.user_config_dir, 'logging-config.yaml') - with open(logging_config, 'rt') as f: - configDict = yaml.safe_load(f.read()) - logging.config.dictConfig(configDict) - logger = logging.getLogger('development') +def get_logger(level: int) -> logging.Logger: + logging.config.dictConfig({ + 'version': 1, + 'disable_existing_loggers': False, + 'formatters': { + 'simple': { + 'format': '[%(asctime)s] {%(filename)s:%(lineno)d} %(levelname)s - %(message)s', + 'datefmt': '%Y-%m-%d %H:%M:%S', + }, + }, + 'handlers': { + 'console': { + 'class': logging.StreamHandler, + 'level': logging.DEBUG, + 'formatter': 'simple', + 'stream': 'ext://sys.stdout' + }, + }, + 'loggers': { + 'urllib3.connectionpool': { + 'level': 'WARNING', + 'propagate': False, + }, + 'root': { + 'level': 'DEBUG', + 'handlers': ['console'], + }, + }, + }) + logger = logging.getLogger(__file__) if level is not None: match level: case 0: @@ -32,35 +53,78 @@ def get_logger(level: int, config: str): logger.setLevel(logging.CRITICAL) return logger +def update_file(log: logging.Logger, media_file): + update = requests.put(f"http://127.0.0.1:8800/api/media/files/{media_file['id']}", json=media_file) + log.info(f"update status: {update.status_code}") + log.info(f"update result: {update.json()}") + +def get_actor_links(log: logging.Logger, media_file_url: str) -> list: + try: + r = requests.get(media_file_url) + soup = BeautifulSoup(r.content, "html.parser") + error404 = soup.css.select_one('.error404-title') + if error404 and error404.get_text() == "Video nicht gefunden": + log.info(f"{error404.get_text()}") + item['url'] = None + item['review'] = False + update_file(log, item) + return [] + anchors = soup.find_all('a', attrs={'href': re.compile("^https://.*pornstars/.*")}) + actor_links = [] + for anchor in anchors: + link_url = anchor.get('href') + if link_url.endswith('all/countries'): + continue + actor_links.append(link_url) + log.info(f"links({len(actor_links)}): {actor_links}") + return actor_links + except Exception as error: + log.info(f"something went wrong: {error}") + return [] + if __name__ == '__main__': - log = get_logger(args.verbose, args.config) - log.info('kontor.update_titles started') - response = requests.get("http://127.0.0.1:8800/api/media/files?review=true") + log = get_logger(args.verbose) + log.info('kontor.find_links started') + log.info('get all actors') + response = requests.get("http://127.0.0.1:8800/api/media/actors") + data = response.json() + actors = {} + for item in data: + actor = {} + actor['id'] = item['id'] + actor['name'] = item['name'] + actor['url'] = item['url'] + actors[item['url']] = actor + log.debug(f'all actors: {actors}') + files_url = "" + if args.all: + files_url= "http://127.0.0.1:8800/api/media/files" + else: + files_url = "http://127.0.0.1:8800/api/media/files?review=true" + response = requests.get(files_url) log.info(f"Status: {response.status_code}") data = response.json() log.info(f"data: {len(data)}") for item in data: link = item['url'] + if not link: + continue + if str(link) == "None": + continue log.info(f"{item['id']} - {str(link)}") - try: - r = requests.get(link) - soup = BeautifulSoup(r.content, "html.parser") - title = soup.title.string - anchors = soup.find_all('a') - for anchor in anchors: - if anchor.has_attr('href'): - link_url = anchor['href'] - if link_url and link_url.__contains__('pornstars/'): - log.info(link_url) - item['title'] = title - item['review'] = False - except Exception as error: - log.info(f"something went wrong: {error} {anchor}") - item['title'] = None - item['review'] = True - #update = requests.put(f"http://127.0.0.1:8800/api/media/files/{item['id']}", json=item) - #log.info(f"update status: {update.status_code}") - #log.info(f"update result: {update.json()}") - log.info('kontor.update_titles finished') - + actor_links = get_actor_links(log, link) + actor_list = [] + for actor_link in actor_links: + if actor_link in actors: + log.info(f"found actor with id: {actors[actor_link]['id']}") + actor_list.append(actors[actor_link]) + actor_response = requests.put(f"http://127.0.0.1:8800/api/media/files/{item['id']}/actors", json=actor_list) + actor_data = actor_response.json() + log.info(f"found {len(actor_data)} actors") + log.info(f"found actors: {actor_data}") + item['review'] = False + update = requests.put(f"http://127.0.0.1:8800/api/media/files/{item['id']}", json=item) + log.info(f"update status: {update.status_code}") + log.info(f"update result: {update.json()}") + log.info('kontor.find_links finished') diff --git a/kontor-scripts/update_title.py b/kontor-scripts/update_title.py index 7031180..c4467af 100644 --- a/kontor-scripts/update_title.py +++ b/kontor-scripts/update_title.py @@ -43,9 +43,14 @@ if __name__ == '__main__': for item in data: link = item['url'] log.info(f"{item['id']} - {str(link)}") + if not link: + continue try: r = requests.get(link) soup = BeautifulSoup(r.content, "html.parser") + title_tag = soup.find('title') + if title_tag: + title= title_tag.get_text() title = soup.title.string item['title'] = title item['review'] = False diff --git a/kontor-spring/src/main/java/de/thpeetz/kontor/media/data/MediaFile.java b/kontor-spring/src/main/java/de/thpeetz/kontor/media/data/MediaFile.java index db1b826..2193690 100644 --- a/kontor-spring/src/main/java/de/thpeetz/kontor/media/data/MediaFile.java +++ b/kontor-spring/src/main/java/de/thpeetz/kontor/media/data/MediaFile.java @@ -16,7 +16,6 @@ import java.util.List; @Setter @EqualsAndHashCode(callSuper = false) @Entity -@Table(uniqueConstraints = { @UniqueConstraint(columnNames = { "url" }) }) public class MediaFile extends AbstractEntity { @Nullable