# Files
# kontor/kontor-scripts/find_links.py
# T
# 2025-09-02 20:39:21 +02:00

# 140 lines
# 5.0 KiB
# Python

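"""
Find actor links for media files.

Loads the known actors and the media files from the local API, fetches each
media file's URL, collects the actor links found on that page, stores the
matching actors on the media file and clears the file's review flag.
"""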
import logging.config
import requests
import re
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from bs4 import BeautifulSoup
# Command-line interface of the script.
# ArgumentDefaultsHelpFormatter only has something to annotate when an option
# carries help text, so every option gets a ``help`` string.
parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
parser.add_argument(
    '--verbose', '-v',
    action='count',
    default=0,
    help='increase log verbosity: none = WARNING, -v = INFO, -vv = DEBUG',
)
parser.add_argument(
    '--all', '-a',
    action='store_true',
    help='process all media files instead of only those flagged for review',
)
# Parsed at module level: ``args`` is read as a global by the entry point.
args = parser.parse_args()
def get_logger(level: int) -> logging.Logger:
    """Configure console logging and return this script's logger.

    Applies a ``dictConfig`` that attaches one stdout handler to the root
    logger and limits ``urllib3.connectionpool`` to WARNING, then sets the
    level of the returned logger from the verbosity count.

    Args:
        level: Verbosity count (the number of ``-v`` flags). ``0`` selects
            WARNING, ``1`` INFO and ``2`` or more DEBUG. A negative value
            selects CRITICAL. ``None`` leaves the logger's level untouched.

    Returns:
        The logger named after this file's path (``__file__``).
    """
    logging.config.dictConfig({
        'version': 1,
        'disable_existing_loggers': False,
        'formatters': {
            'simple': {
                'format': '[%(asctime)s] {%(filename)s:%(lineno)d} %(levelname)s - %(message)s',
                'datefmt': '%Y-%m-%d %H:%M:%S',
            },
        },
        'handlers': {
            'console': {
                # The dictConfig schema documents 'class' as the fully
                # qualified class *name*; a class object is not accepted by
                # every Python version.
                'class': 'logging.StreamHandler',
                'level': logging.DEBUG,
                'formatter': 'simple',
                'stream': 'ext://sys.stdout',
            },
        },
        'loggers': {
            # No handler and no propagation: these records never reach the
            # console handler configured on the root logger.
            'urllib3.connectionpool': {
                'level': 'WARNING',
                'propagate': False,
            },
            # NOTE(review): the root logger is configured through the
            # 'loggers' section instead of the top-level 'root' key - confirm
            # this is intended.
            'root': {
                'level': 'DEBUG',
                'handlers': ['console'],
            },
        },
    })
    logger = logging.getLogger(__file__)
    if level is not None:
        if level >= 2:
            # More than two -v flags must not reduce the output, so every
            # count from 2 upwards means DEBUG.
            logger.setLevel(logging.DEBUG)
        elif level == 1:
            logger.setLevel(logging.INFO)
        elif level == 0:
            logger.setLevel(logging.WARNING)
        else:
            # Unexpected (negative) counts keep the previous fallback.
            logger.setLevel(logging.CRITICAL)
    return logger
def update_file(log: logging.Logger, media_file):
    """Send the media file record back to the local API and log the reply.

    Args:
        log: Logger that receives the debug output.
        media_file: Record sent as the JSON body of the PUT request; its
            ``'id'`` entry selects the endpoint.
    """
    endpoint = f"http://127.0.0.1:8800/api/media/files/{media_file['id']}"
    reply = requests.put(endpoint, json=media_file)
    status_message = f"update status: {reply.status_code}"
    log.debug(status_message)
    # The body is decoded as JSON unconditionally, as before.
    result_message = f"update result: {reply.json()}"
    log.debug(result_message)
# Matches the href of anchors that point at an actor page.
_ACTOR_LINK_PATTERN = re.compile("^https://.*pornstars/.*")


def get_actor_links(
    log: logging.Logger,
    media_file_url: str,
    *,
    media_file: dict | None = None,
    timeout: float = 30.0,
) -> list:
    """Fetch a media file's page and return the actor links found on it.

    If the page shows the "Video nicht gefunden" error title, the media file
    record is updated instead (``url`` set to ``None``, ``review`` set to
    ``False``, then sent via ``update_file``) and an empty list is returned.

    Args:
        log: Logger for warnings and debug output.
        media_file_url: URL of the page to fetch.
        media_file: Record to update when the page reports a missing video.
            When omitted, the module-level variable ``item`` is used, which
            is what callers relied on before this parameter existed.
        timeout: Seconds to wait for the page before giving up.

    Returns:
        The distinct actor links in page order, without links ending in
        ``all/countries``. An empty list on a missing video or on any error.
    """
    if media_file is None:
        # Backward compatibility: older callers pass no record and expect the
        # function to pick up the loop variable ``item`` of the main script.
        media_file = globals().get('item')
    try:
        r = requests.get(media_file_url, timeout=timeout)
        soup = BeautifulSoup(r.content, "html.parser")
        error404 = soup.select_one('.error404-title')
        if error404 is not None and error404.get_text() == "Video nicht gefunden":
            log.warning(f"{error404.get_text()}")
            if media_file is None:
                log.warning("something went wrong: no media file record to update")
                return []
            media_file['url'] = None
            media_file['review'] = False
            update_file(log, media_file)
            return []
        anchors = soup.find_all('a', attrs={'href': _ACTOR_LINK_PATTERN})
        # dict.fromkeys drops duplicates while keeping first-seen order.
        actor_links = list(dict.fromkeys(
            link_url
            for link_url in (anchor.get('href') for anchor in anchors)
            if not link_url.endswith('all/countries')
        ))
        log.debug(f"links({len(actor_links)}): {actor_links}")
        return actor_links
    except Exception as error:
        # Best effort per media file: report the problem and carry on.
        log.warning(f"something went wrong: {error}")
        return []
# Script entry point: load the known actors, then look up actor links for
# each media file and store the result through the local API.
# NOTE(review): indentation is missing in this copy of the file, so the
# nesting of the statements below is not visible - confirm against the
# original before restructuring.
if __name__ == '__main__':
log = get_logger(args.verbose)
log.warning('kontor.find_links started')
log.debug('get all actors')
response = requests.get("http://127.0.0.1:8800/api/media/actors")
data = response.json()
# Index the actors by their URL so links found on a page can be matched.
actors = {}
for item in data:
actor = {}
actor['id'] = item['id']
actor['name'] = item['name']
actor['url'] = item['url']
actors[item['url']] = actor
log.debug(f'all actors: {actors}')
# --all selects every media file; otherwise only files with review=true.
files_url = ""
if args.all:
files_url= "http://127.0.0.1:8800/api/media/files"
else:
files_url = "http://127.0.0.1:8800/api/media/files?review=true"
response = requests.get(files_url)
log.debug(f"Status: {response.status_code}")
data = response.json()
log.debug(f"data: {len(data)}")
# NOTE: the loop variable must stay named ``item`` - get_actor_links() reads
# it as a module-level global when it handles a missing video.
for item in data:
link = item['url']
# Skip media files without a URL.
if not link:
continue
# presumably guards against a URL stored as the text "None" - TODO confirm
if str(link) == "None":
continue
log.warning(f"{item['id']} - {str(link)}")
actor_links = get_actor_links(log, link)
# Keep only the links that belong to a known actor.
actor_list = []
for actor_link in actor_links:
if actor_link in actors:
log.debug(f"found actor with id: {actors[actor_link]['id']}")
actor_list.append(actors[actor_link])
# Store the matched actors on the media file.
actor_response = requests.put(f"http://127.0.0.1:8800/api/media/files/{item['id']}/actors", json=actor_list)
actor_data = actor_response.json()
# The length of the response is compared with the number of links found on
# the page, not with the number of matched actors sent in the request.
persisted_actor_links: int = len(actor_data)
found_actor_links: int = len(actor_links)
if persisted_actor_links != found_actor_links:
log.warning(f"{persisted_actor_links} links persisted, but {found_actor_links} links are available")
log.info(f"found actors: {actor_links}")
log.debug(f"found {persisted_actor_links} actors")
log.debug(f"found actors: {actor_data}")
# Clear the review flag and write the media file record back.
item['review'] = False
update = requests.put(f"http://127.0.0.1:8800/api/media/files/{item['id']}", json=item)
log.debug(f"update status: {update.status_code}")
log.debug(f"update result: {update.json()}")
log.warning('kontor.find_links finished')