refactor scripts to work with api

This commit is contained in:
Thomas Peetz
2025-04-16 05:08:59 +02:00
parent 98e3d91edd
commit 4a61d6a727
8 changed files with 262 additions and 191 deletions
+48 -61
View File
@@ -1,12 +1,13 @@
"""
Checks the database kontor
"""
from dataclasses import dataclass
from enum import Enum, auto
import mariadb
from pathlib import Path
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
import requests
from config import get_logger, get_database_cursors
parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
@@ -25,71 +26,63 @@ class StatusType(Enum):
CLOUD_LINK = auto()
CLOUD_LINK_ID = auto()
@dataclass
class FileStatus:
    """Result of matching a file on disk against the known media files.

    Both fields have defaults so that an empty status can be created with
    ``FileStatus()`` and filled in afterwards.
    """

    # id of the matching media file; empty as long as nothing matched
    id: str = ''
    # how the file was matched; UNKNOWN as long as nothing matched
    status_type: StatusType = StatusType.UNKNOWN
def get_status_of_file(found_file, cursor):
    """Classify a file by looking it up in the ``media_file`` table.

    Three lookups run one after another (by file name, by id, by cloud
    link); a later match overrides the result of an earlier one.  A database
    error is logged at debug level and only the affected lookup is skipped.

    :param found_file: path object providing ``name`` and ``stem``
    :param cursor: open database cursor
    :return: tuple ``(status, file_id)``; ``(StatusType.UNKNOWN, '')`` when
        no lookup matched
    """
    status = StatusType.UNKNOWN
    file_id = ''

    # All values are bound as parameters: file names come from the file
    # system and may contain quotes that would break or alter the statement.
    try:
        cursor.execute('SELECT id, cloud_link FROM media_file WHERE file_name=?', (found_file.name,))
        rows = cursor.fetchall()
        if len(rows) == 1:
            status = StatusType.FILE_NAME
            file_id = rows[0][0]
    except mariadb.Error as error:
        logger.debug(f'select failed with {error}')

    try:
        cursor.execute('SELECT id FROM media_file WHERE id=?', (found_file.stem,))
        rows = cursor.fetchall()
        if len(rows) == 1:
            status = StatusType.FILE_ID
            file_id = rows[0][0]
        if len(rows) > 1:
            status = StatusType.DUPLICATE
            for row in rows:
                logger.info(f"found {row[0]} with {found_file}")
    except mariadb.Error as error:
        logger.debug(f'select failed with {error}')

    try:
        cursor.execute('SELECT id FROM media_file WHERE cloud_link LIKE ?', (f'%{found_file.stem}%',))
        rows = cursor.fetchall()
        if len(rows) == 1:
            file_id = rows[0][0]
            # the link points at a file that is already named after its id
            if rows[0][0] == found_file.stem:
                status = StatusType.CLOUD_LINK_ID
            else:
                status = StatusType.CLOUD_LINK
    except mariadb.Error as error:
        logger.debug(f'select failed with {error}')

    return status, file_id
def rename_files_to_id(media_dir, conn, dry_run):
def get_status_of_file(found_file: Path, log) -> FileStatus:
    """Classify a file by asking the media API about it.

    Three requests are sent one after another; a later match overrides the
    result of an earlier one.

    :param found_file: file to classify; its ``stem`` is used as candidate id
    :param log: logger used to report the HTTP status of every request
    :return: status with the matched id and match type; id ``''`` and
        ``StatusType.UNKNOWN`` when no request produced a match
    """
    status = FileStatus(id='', status_type=StatusType.UNKNOWN)

    # NOTE(review): the search request carries no criteria, exactly as before;
    # it presumably should contain the file name - confirm against the API.
    response = requests.post("http://127.0.0.1:8800/media/search")
    log.info(f"Status: {response.status_code}")
    _apply_single_match(status, response.json(), StatusType.FILE_NAME)

    response = requests.get(f"http://127.0.0.1:8800/media/files/{found_file.stem}")
    log.info(f"Status: {response.status_code}")
    _apply_single_match(status, response.json(), StatusType.FILE_ID)

    # NOTE(review): this request does not depend on found_file - confirm the
    # intended filter.
    response = requests.get("http://127.0.0.1:8800/media/files?cloud_link=true")
    log.info(f"Status: {response.status_code}")
    _apply_single_match(status, response.json(), StatusType.CLOUD_LINK_ID)

    return status


def _apply_single_match(status, data, status_type):
    """Record id and match type in *status* if *data* holds exactly one record with an ``id``."""
    if len(data) != 1:
        return
    # NOTE(review): the response schema is not visible here; both a
    # one-element list of records and a plain record are accepted.
    record = data[0] if isinstance(data, list) else data
    if isinstance(record, dict) and 'id' in record:
        status.id = record['id']
        status.status_type = status_type
def rename_files_to_id(media_dir, dry_run, log):
media_path = Path(media_dir)
cursor = conn.cursor()
for file in media_path.iterdir():
logger.debug('found file: {}'.format(file.name))
(status, file_id) = get_status_of_file(file, cursor)
new_file_path = file.with_name(f"{file_id}{file.suffix}")
match status:
log.debug('found file: {}'.format(file.name))
status = get_status_of_file(file, log)
new_file_path = file.with_name(f"{status.id}{file.suffix}")
file_id = status.id
match status.status_type:
case StatusType.FILE_NAME:
logger.info(f'status of {file.name} is file_name')
log.info(f'status of {file.name} is file_name')
rename_file(file, new_file_path, dry_run)
update_cloud_link(file_id, new_file_path, conn, dry_run)
update_cloud_link(file_id, new_file_path, dry_run)
case StatusType.FILE_ID:
logger.info(f'status of {file.name} is file_id')
update_cloud_link(file_id, new_file_path, conn, dry_run)
log.info(f'status of {file.name} is file_id')
update_cloud_link(file_id, new_file_path, dry_run)
case StatusType.CLOUD_LINK:
logger.info(f'status of {file.name} is cloud_link')
log.info(f'status of {file.name} is cloud_link')
rename_file(file, new_file_path, dry_run)
update_cloud_link(file_id, new_file_path, conn, dry_run)
update_cloud_link(file_id, new_file_path, dry_run)
case StatusType.CLOUD_LINK_ID:
logger.debug(f'status of {file.name} is cloud_link_id')
update_cloud_link(file_id, new_file_path, conn, dry_run)
log.debug(f'status of {file.name} is cloud_link_id')
update_cloud_link(file_id, new_file_path, dry_run)
case StatusType.DUPLICATE:
logger.info(f'status of {file.name} is duplicate')
log.info(f'status of {file.name} is duplicate')
case StatusType.UNKNOWN:
logger.info(f'status of {file.name} is unknown')
log.info(f'status of {file.name} is unknown')
case _:
logger.info(f'status of {file.name} is not defined')
log.info(f'status of {file.name} is not defined')
def rename_file(current_file, new_file_path, dry_run):
if dry_run:
@@ -118,14 +111,8 @@ def reset_cloud_link(conn, dry_run):
if __name__ == '__main__':
logger = get_logger(args.verbose, args.config)
logger.info("kontor.check_kontor started")
_, mariadb_conn = get_database_cursors(logger, args.config)
mariadb_cursor = mariadb_conn.cursor()
if args.reset_cloud_link:
reset_cloud_link(mariadb_conn, args.dry_run)
link_list = []
data_dir = args.dir
logger.info("kontor.check_kontor.rename_files_to_id")
rename_files_to_id(data_dir, mariadb_conn, args.dry_run)
rename_files_to_id(args.dir, args.dry_run, logger)
#logger.info("kontor.check_kontor.update_cloud_link_with_found_files")
#update_cloud_link_with_found_files(data_dir, mariadb_conn, args.dry_run)
#logger.info("kontor.check_kontor.get_ids_from_column_cloud_link")
@@ -133,5 +120,5 @@ if __name__ == '__main__':
#logger.info('found {} ids in column cloud_link'.format(len(link_list)))
#logger.info("kontor.check_kontor.checking_ids_from_cloud_link")
#checking_ids_from_cloud_link(link_list, mariadb_cursor)
mariadb_conn.close()
logger.info("kontor.check_kontor finished")
+101 -37
View File
@@ -1,13 +1,15 @@
"""
download files with URLs from DB
"""
import re
import subprocess
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from platformdirs import PlatformDirs
from datetime import datetime
from enum import Enum, auto
from pathlib import Path
import yaml
from sqlalchemy import create_engine, select
from sqlalchemy.orm import sessionmaker
from schema import Base, KontorDB, MediaFile
from uuid import UUID
import requests
from config import get_logger
@@ -17,42 +19,104 @@ parser.add_argument('--config', '-c', default='kontor-docker')
parser.add_argument('--dir', '-d', default='/data/media')
parser.add_argument('--tool', '-t', default='yt-dlp')
parser.add_argument('--dry-run', '-m', action='store_true')
parser.add_argument('--rename', '-r', action='store_true')
args = parser.parse_args()
class FileStatus(Enum):
    """Outcome of looking for a media file in the download directory."""

    # a file was found under the name stored for the entry
    DOWNLOADED = 1
    # a file was found under the id of the entry
    RENAMED = 2
    # no file was found
    UNKNOWN = 3
def download_file(url: str, file_info: dict, download_dir: str = "/data/media", dl_tool: str = "yt-dlp") -> dict:
    """Download *url* with an external tool and record the outcome in *file_info*.

    The tool runs inside *download_dir*; the name of the resulting file is
    taken from the tool's standard output.

    :param url: address handed to the download tool
    :param file_info: record of the media file; updated in place
    :param download_dir: working directory of the tool
    :param dl_tool: executable to run
    :return: the same *file_info* dict; unchanged when the tool exits with an
        error
    """
    print(f"download file for {url} to {download_dir}")
    result = subprocess.run([dl_tool, url], cwd=download_dir, capture_output=True, text=True)
    if result.returncode != 0:
        # Report the failure instead of returning as if nothing had happened.
        print(f"download of {url} failed with exit code {result.returncode}: {result.stderr.strip()}")
        return file_info

    # The output is parsed as printed: collapsing runs of blanks would also
    # alter file names that contain consecutive blanks.
    file_name = __parse_output__(result.stdout.splitlines())
    if file_name is None:
        # The tool succeeded but did not report a file: flag for review and retry.
        file_info['review'] = True
        file_info['should_download'] = True
        file_info['file_name'] = None
    else:
        download_file_name = Path(download_dir, file_name)
        file_info['should_download'] = False
        file_info['file_name'] = download_file_name.name
        file_info['cloud_link'] = str(download_file_name.absolute())
        # NOTE(review): the timestamp is set for successful downloads only - confirm.
        file_info['last_modified_date'] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    return file_info
def __parse_output__(lines_list: list[str]) -> str | None:
file_name = None
for line in lines_list:
if 'has already been downloaded' in line:
end_len = len(' has already been downloaded')
file_name = line[11:-end_len]
if 'Destination' in line:
line_len = len(line)
start_len = len('[download] Destination: ')
file_len = line_len - start_len
file_name = line[-file_len:]
return file_name
def is_file_downloaded(item: dict, dir: Path) -> FileStatus:
    """Check whether the file of *item* already exists in *dir*.

    The file is looked for under the stored file name first and under the id
    of the entry second.  On a match ``should_download`` is cleared in
    *item*; for a file named after the id ``cloud_link`` is set as well.

    :param item: record of the media file; read (``file_name``, ``id``) and
        updated in place
    :param dir: directory that holds the downloaded files
    :return: ``DOWNLOADED``, ``RENAMED`` or ``UNKNOWN`` if no file was found
    """
    def mp4_name(name: str) -> str:
        # ".mp4" is a suffix of the file name, not a path component of its own
        return name if name.endswith(".mp4") else f"{name}.mp4"

    # NOTE(review): only .mp4 files are recognised, as before - confirm that
    # other container formats cannot occur.
    file_name_as_title = item['file_name']
    if file_name_as_title is not None:
        file_title = Path(dir, mp4_name(f"{file_name_as_title}"))
        if file_title.exists():
            log.info(f"{file_name_as_title} has been downloaded")
            item['should_download'] = 0
            return FileStatus.DOWNLOADED

    file_with_id_as_name = Path(dir, mp4_name(f"{item['id']}"))
    if file_with_id_as_name.exists():
        log.info(f"{file_with_id_as_name} has been downloaded and renamed")
        # stored as text: the record is sent as JSON later on
        item['cloud_link'] = str(file_with_id_as_name)
        item['should_download'] = 0
        return FileStatus.RENAMED

    log.info("could not find file - start download")
    return FileStatus.UNKNOWN
def update_status(item_id: UUID, file_info: dict, timeout: float = 30.0):
    """Send the updated record of one media file to the media API.

    :param item_id: id of the media file, used in the request URL
    :param file_info: record sent as JSON body
    :param timeout: seconds to wait for the API before giving up
    """
    update = requests.put(f"http://127.0.0.1:8800/media/files/{item_id}", json=file_info, timeout=timeout)
    log.info(f"update status: {update.status_code}")
    try:
        result = update.json()
    except ValueError:
        # an error response is not necessarily JSON; log the raw body instead
        result = update.text
    log.info(f"update result: {result}")
def rename_file(file_info: dict):
    """Rename the file of *file_info* to ``<id><suffix>`` and record the new path.

    The file is expected in the directory given by ``args.dir``.  Nothing is
    renamed when the record carries no file name.

    :param file_info: record of the media file; ``cloud_link`` is updated in
        place after the rename
    """
    item_id = file_info['id']
    file_name = file_info['file_name']
    if not file_name:
        # e.g. the download did not report a file name
        log.warning(f"no file name known for {item_id} - nothing to rename")
        return

    file = Path(args.dir, file_name)
    new_file_path = file.with_name(f"{item_id}{file.suffix}")
    log.info(f"rename {file} to {new_file_path}")
    file.rename(new_file_path)
    file_info['cloud_link'] = str(new_file_path)
if __name__ == '__main__':
log = get_logger(args.verbose, args.config)
log.info('kontor.download started')
dirs = PlatformDirs(args.config)
database_config = Path(dirs.user_config_dir, 'database-config.yaml')
with open(database_config, 'rt') as f:
db_config = yaml.safe_load(f.read())
print(db_config)
connect_string = ('mariadb+mariadbconnector://{}:{}@{}:{}/{}'.format(
db_config['mariadb']['user'],
db_config['mariadb']['password'],
db_config['mariadb']['host'],
db_config['mariadb']['port'],
db_config['mariadb']['database']
))
engine = create_engine(connect_string)
Base.metadata.create_all(bind=engine, checkfirst=True)
__session__ = sessionmaker(bind=engine)
_filter = {'should_download': 1}
with __session__() as session:
files = session.query(MediaFile).filter_by(**_filter).all()
log.info("found %d entries", len(files))
files2 = session.query(MediaFile).filter(MediaFile.should_download == 1).all()
log.info("found %d entries", len(files2))
for mediafile in files2:
mediafile.download_file(download_dir=args.dir, dl_tool="yt-dlp")
log.info("Datei {} erfolgreich heruntergeladen".format(mediafile.file_name))
if args.rename:
current_file = Path(mediafile.file_name)
new_file_path = current_file.with_name(f"{mediafile.id}{current_file.suffix}")
current_file.rename(Path(new_file_path))
mediafile.cloud_link = new_file_path
session.add(mediafile)
session.commit()
response = requests.get("http://127.0.0.1:8800/media/files?download=true")
log.info(f"Status: {response.status_code}")
data = response.json()
log.info(f"data: {len(data)}")
for item in data:
link = item['url']
file_id = item['id']
log.info(f"{file_id} - {link}")
download_status: FileStatus = is_file_downloaded(item, args.dir)
match download_status:
case FileStatus.DOWNLOADED:
rename_file(item)
update_status(file_id, item)
case FileStatus.RENAMED:
log.info("update status")
update_status(file_id, item)
case FileStatus.UNKNOWN:
download_file(link, item)
rename_file(item)
log.info(f'{item}')
update_status(file_id, item)
log.info('kontor.download finished')
+23 -30
View File
@@ -2,14 +2,14 @@
download files with URLs from DB
"""
import logging.config
import requests
import yaml
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter
from pathlib import Path
from platformdirs import PlatformDirs
from sqlalchemy import create_engine, select
from sqlalchemy.orm import sessionmaker
from schema import MediaFile, Base
from bs4 import BeautifulSoup
from platformdirs import PlatformDirs
parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
parser.add_argument('--verbose', '-v', action='count', default=0)
@@ -37,30 +37,23 @@ def get_logger(level: int, config: str):
if __name__ == '__main__':
log = get_logger(args.verbose, args.config)
log.info('kontor.update_titles started')
dirs = PlatformDirs(args.config)
database_config = Path(dirs.user_config_dir, 'database-config.yaml')
with open(database_config, 'rt') as f:
db_config = yaml.safe_load(f.read())
print(db_config)
connect_string = ('mariadb+mariadbconnector://{}:{}@{}:{}/{}'.format(
db_config['mariadb']['user'],
db_config['mariadb']['password'],
db_config['mariadb']['host'],
db_config['mariadb']['port'],
db_config['mariadb']['database']
))
engine = create_engine(connect_string)
Base.metadata.create_all(bind=engine, checkfirst=True)
__session__ = sessionmaker(engine)
_filter = {'review': 1}
with __session__() as session:
files = session.query(MediaFile).filter_by(**_filter).all()
log.info("found %d entries", len(files))
files2 = session.query(MediaFile).filter(MediaFile.review ==1).all
log.info("found %d entries", len(files2))
for mediafile in files:
mediafile.update_title()
session.add(mediafile)
session.commit()
log.info("found %d entries", len(files))
# Ask the API for the entries matching review=true and try to fill in their titles.
response = requests.get("http://127.0.0.1:8800/media/files?review=true")
log.info(f"Status: {response.status_code}")
data = response.json()
log.info(f"data: {len(data)}")
for item in data:
    link = item['url']
    log.info(f"{item['id']} - {link}")
    title = None
    try:
        # the timeout keeps one unresponsive site from blocking the whole run
        page = requests.get(link, timeout=30)
        soup = BeautifulSoup(page.content, "html.parser")
        if soup.title is not None:
            title = soup.title.string
    except Exception as error:
        # Best effort: a page that cannot be fetched or parsed keeps the entry
        # in review.  Unlike a bare except this lets Ctrl-C and SystemExit
        # through and records the cause.
        log.info(f"could not read title of {link}: {error}")
    item['title'] = title
    # an entry without a title still needs a review
    item['review'] = 1 if title is None else 0
    update = requests.put(f"http://127.0.0.1:8800/media/files/{item['id']}", json=item)
    log.info(f"update status: {update.status_code}")
    log.info(f"update result: {update.json()}")
log.info('kontor.update_titles finished')