extends add_links to get title and actors

This commit is contained in:
2025-12-29 02:51:03 +01:00
parent 66cbd4b2d1
commit 41733ec030
2 changed files with 72 additions and 15 deletions
+71 -14
View File
@@ -2,7 +2,10 @@
read file with links and store it in DB
"""
import logging.config
import re
from typing import List
from bs4 import BeautifulSoup
import requests
import yaml
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from pathlib import Path
@@ -11,11 +14,7 @@ from pathlib import Path
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, Session
from db.models.base import Base
from db.models import registry
from psycopg2.errors import NotNullViolation
from config import get_logger
import os
import json
from db.models.media import MediaFile
@@ -24,6 +23,7 @@ parser.add_argument('--file', '-f', help='file with links', default='~/.sync/med
# Command-line options: storage mode, config name, verbosity and dry run.
parser.add_argument('--video', help='store Url as VideoFile', action="store_true")
parser.add_argument('--config', '-c', default='kontor-docker')
# Repeatable flag: the resulting count is handed to get_logger() as the level.
parser.add_argument('--verbose', '-v', action='count', default=0)
# When set, the main block does not add or commit new MediaFile rows.
parser.add_argument('--dry-run', '-m', help='execute script without storing', action="store_true")
args = parser.parse_args()
DB_USER: str = os.getenv("DB_USER", "kontor")
@@ -33,6 +33,31 @@ DB_PORT: int = int(os.getenv("DB_PORT", 5432))
DB_DBNAME: str = os.getenv("DB_DBNAME", "kontor")
DATABASE_URL: str = f"postgresql://{DB_USER}:{DB_PASSWORD}@{DB_SERVER}:{DB_PORT}/{DB_DBNAME}"
def get_logger(level, config: str):
    """Configure logging from the user's YAML file and return the 'development' logger.

    The configuration is read from ``logging-config.yaml`` inside the user
    config directory that ``PlatformDirs(config)`` reports.

    Args:
        level: Verbosity count (the script passes the number of ``-v`` flags).
            ``0`` selects CRITICAL, ``1`` selects INFO and ``2`` or more
            selects DEBUG. ``None`` leaves the level from the YAML file
            untouched. Any other value falls back to INFO.
        config: Application name handed to ``PlatformDirs``.

    Returns:
        The configured ``logging.Logger`` named ``development``.

    Raises:
        OSError: If the logging configuration file cannot be opened.
    """
    dirs = PlatformDirs(config)
    logging_config = Path(dirs.user_config_dir, 'logging-config.yaml')
    # Explicit encoding so the file is read the same way on every platform.
    with open(logging_config, 'rt', encoding='utf-8') as f:
        log_config = yaml.safe_load(f)
    logging.config.dictConfig(log_config)
    logger = logging.getLogger('development')
    if level is not None:
        if level == 0:
            chosen = logging.CRITICAL
        elif isinstance(level, int) and level >= 2:
            # More -v flags must never make the output quieter, so every
            # count from 2 upwards maps to DEBUG.
            chosen = logging.DEBUG
        else:
            chosen = logging.INFO
        logger.setLevel(chosen)
    return logger
def get_session() -> Session:
    """Return a new ORM session bound to an engine created from DATABASE_URL.

    Before the session is created, ``Base.metadata.create_all`` is invoked on
    the engine with ``checkfirst=True``.
    """
    db_engine = create_engine(DATABASE_URL)
    Base.metadata.create_all(bind=db_engine, checkfirst=True)
    session_factory = sessionmaker(bind=db_engine)
    return session_factory()
def load_data(filename: str, log) -> List[str]:
links: List[str] = []
log.debug("load_data")
@@ -47,26 +72,58 @@ def load_data(filename: str, log) -> List[str]:
links.append(line.rstrip())
return links
def get_meta_info(media_file: MediaFile, log, timeout: float = 30.0):
    """Fetch the page behind ``media_file.url`` and fill in meta information.

    The page is downloaded and parsed as HTML. Depending on what is found,
    ``media_file`` is modified in place:

    * An element with class ``error404-title`` whose text is
      "Video nicht gefunden": ``url`` is set to ``None``, ``review`` to
      ``False``, and the function returns immediately.
    * A ``<title>`` tag: its text becomes ``title`` and ``review`` is set to
      ``False``.
    * Any exception on the way: ``title`` is set to ``None`` and ``review`` to
      ``True`` so the entry can be checked by hand.

    Links whose ``href`` matches the actor pattern are collected without
    duplicates and logged; they are not stored on ``media_file`` here.

    Args:
        media_file: Entry to update; only ``url``, ``title`` and ``review``
            are touched.
        log: Logger used for progress and error messages.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a single unresponsive host would block the whole run.

    Returns:
        None.
    """
    # NOTE(review): the nesting below was reconstructed from a rendering
    # without indentation -- confirm that `review = False` belongs to the
    # title branch and that the final log call sits after the try/except.
    try:
        r = requests.get(media_file.url, timeout=timeout)
        soup = BeautifulSoup(r.content, "html.parser")
        error404 = soup.css.select_one('.error404-title')
        if error404 and error404.get_text() == "Video nicht gefunden":
            log.warning(f"{error404.get_text()}")
            media_file.url = None
            media_file.review = False
            return

        title_tag = soup.find('title')
        if title_tag:
            media_file.title = title_tag.get_text()
            media_file.review = False

        anchors = soup.find_all('a', attrs={'href': re.compile("^https://.*pornstars/.*")})
        hrefs = (str(anchor.get("href")) for anchor in anchors)  # type: ignore
        # dict.fromkeys drops duplicates while keeping first-seen order.
        actor_links = list(dict.fromkeys(
            href for href in hrefs if not href.endswith('all/countries')
        ))
        log.info(f"links({len(actor_links)}): {actor_links}")
    except Exception as error:
        # Best effort: a failed lookup must not abort the import, the entry
        # is flagged for review instead.
        log.info(f"something went wrong: {error}")
        media_file.title = None
        media_file.review = True
    log.info(f"update MediaFile with MetaInfos to {repr(media_file)}")
# Script entry point: set up logging, open a database session, read the links
# from the input file and create a MediaFile for every link that has no entry
# with that URL yet.
# NOTE(review): this span comes from a rendered diff without +/- markers, so it
# appears to contain both the removed and the added side of several
# statements -- confirm against the committed file before relying on it.
if __name__ == '__main__':
logger = get_logger(args.verbose, "kontor")
logger.info('kontor.add_links started')
# NOTE(review): the next four lines repeat the body of get_session() and look
# like the pre-change session setup -- TODO confirm they were removed.
engine = create_engine(DATABASE_URL)
Base.metadata.create_all(bind=engine, checkfirst=True)
SessionLocal = sessionmaker(bind=engine)
with SessionLocal() as db:
session = get_session()
with session as db:
links = load_data(args.file, logger)
for link in links:
# NOTE(review): the same message is logged at info and at debug level here;
# presumably only one of the two lines is live.
logger.info(f"process {link}")
logger.debug(f"process {link}")
# Look up existing entries by exact URL match.
media_files = db.query(MediaFile).filter(MediaFile.url == link).all()
if len(media_files) == 0:
logger.info("no entry is found")
logger.info(f"MediaFile for link {link} not found")
# New entries start flagged for review and for download.
media_file = MediaFile()
media_file.url = link
media_file.review = True
media_file.should_download = True
# NOTE(review): this add/commit pair is unconditional while the one further
# down is guarded by --dry-run; presumably this is the superseded version.
db.add(media_file)
db.commit()
# else:
# logger.info("entry is found")
# get_meta_info() may clear media_file.url when the page reports a missing video.
get_meta_info(media_file, logger)
# Persist only when --dry-run was not given.
if not args.dry_run:
db.add(media_file)
db.commit()
db.refresh(media_file)
else:
for media_file in media_files:
logger.debug(f"MediaFile with {media_file.id} is found")
logger.info('kontor.add_link finished')
+1 -1
View File
@@ -16,7 +16,7 @@ class MediaFile(Base, BaseMixin, BaseVideoMixin):
media_actor_files = relationship("MediaActorFile")
def __repr__(self):
    """Return a multi-line debug representation of this media file.

    The output lists the id, title, URL, review flag and download flag, one
    per line, each indented with a tab.
    """
    return f'MediaFile(\n\tID: {self.id}\n\tTitle: {self.title}\n\tURL: {self.url}\n\tReview: {self.review}\n\tDownload: {self.should_download})'
def __str__(self):
    """Return the short display form: the title followed by the id in parentheses."""
    title, ident = self.title, self.id
    return f'{title}({ident})'