# funkwhale/api/funkwhale_api/contrib/archivedl/tasks.py
import asyncio
import hashlib
import logging
import os
import tempfile
import time
import urllib.parse
from datetime import timedelta
import requests
from django.core.files import File
from django.utils import timezone
from funkwhale_api.federation import actors
from funkwhale_api.music import models, utils
from funkwhale_api.taskapp import celery
logger = logging.getLogger(__name__)
def check_existing_download_task(track):
    """Return ``True`` when an upload for *track* already exists or is pending.

    The archive download task should not run twice for the same track;
    callers can use the returned flag to abort early.  (The previous
    version always returned ``None``, so legacy callers that ignore the
    return value keep their old behaviour.)
    """
    if models.Upload.objects.filter(
        track=track,
        import_status__in=["pending", "finished"],
    ).exists():
        logger.info("Upload for this track already exist or is pending. Stopping task.")
        return True
    return False
def check_last_third_party_queries(track, count):
    """Throttle archive.org queries so we stay under their rate limit.

    If another archive-dl upload for this track was created within the
    last few seconds, wait 10 seconds and re-check, giving up after the
    counter exceeds 3.  Rewritten as a loop instead of tail recursion so
    long waits cannot grow the call stack.

    NOTE(review): the filter is scoped to *track*, so this only throttles
    per-track, not globally across all archive.org queries — confirm
    that is intended.
    """
    # 15 per minutes according to their doc = one each 4 seconds
    time_threshold = timezone.now() - timedelta(seconds=5)
    while models.Upload.objects.filter(
        track=track,
        third_party_provider="archive-dl",
        import_status__in=["pending", "finished"],
        creation_date__gte=time_threshold,
    ).exists():
        logger.info(
            "Last archive.org query was too recent. Trying to wait 10 seconds..."
        )
        time.sleep(10)
        count += 1
        if count > 3:
            logger.info(
                "Probably too many archivedl tasks are queue, stopping this task"
            )
            return
        # Recompute the threshold so the window moves with the wait,
        # exactly as the recursive version did on each call.
        time_threshold = timezone.now() - timedelta(seconds=5)
def create_upload(url, track, files_data):
    """Create a pending Upload for *track* pointing at an archive.org file.

    *files_data* is a single entry from the archive.org
    ``/metadata/<identifier>/files`` payload.

    Returns the newly created ``models.Upload`` with
    ``import_status="pending"``.
    """
    mimetype = f"audio/{files_data.get('format', 'unknown')}"
    # archive.org exposes the track duration (in seconds, as a string)
    # under "length"; the previous code read "mtime", which is a file
    # modification timestamp, not a duration.
    try:
        duration = int(float(files_data.get("length", 0)))
    except (TypeError, ValueError):
        duration = 0
    filesize = files_data.get("size", 0)
    bitrate = files_data.get("bitrate", 0)
    # NOTE(review): this creates a brand-new library on every download;
    # consider get_or_create to avoid accumulating service libraries.
    service_library = models.Library.objects.create(
        privacy_level="everyone",
        actor=actors.get_service_actor(),
    )
    return models.Upload.objects.create(
        mimetype=mimetype,
        source=url,
        third_party_provider="archive-dl",
        creation_date=timezone.now(),
        track=track,
        duration=duration,
        size=filesize,
        bitrate=bitrate,
        library=service_library,
        from_activity=None,
        import_status="pending",
    )
@celery.app.task(name="archivedl.archive_download")
@celery.require_instance(models.Track.objects.select_related(), "track")
def archive_download(track, conf):
    """Search archive.org for *track* and download the first matching item.

    Builds an advanced-search query from the track title and artist
    credit, fetches one result page, and hands each result document to
    ``download_item``.
    """
    # Abort when an upload for this track already exists or is pending.
    # (If the helper returns a falsy value — including the legacy None —
    # we proceed, preserving the old behaviour.)
    if check_existing_download_task(track):
        return
    check_last_third_party_queries(track, 0)
    artist_name = utils.get_artist_credit_string(track)
    query = f"mediatype:audio AND title:{track.title} AND creator:{artist_name}"
    with requests.Session() as session:
        url = get_search_url(query, page_size=1, page=1)
        page_data = fetch_json(url, session)
        # Be defensive about the search payload shape: missing keys
        # simply mean there is nothing to download.
        for obj in page_data.get("response", {}).get("docs", []):
            logger.info(f"launching download item for {str(obj)}")
            download_item(
                item_data=obj,
                session=session,
                allowed_extensions=utils.SUPPORTED_EXTENSIONS,
                track=track,
            )
def fetch_json(url, session):
    """GET *url* through *session* and return the decoded JSON payload."""
    logger.info(f"Fetching {url}...")
    response = session.get(url)
    with response:
        payload = response.json()
    return payload
def download_item(
    item_data,
    session,
    allowed_extensions,
    track,
):
    """Download the first supported file of an archive.org item.

    Fetches the item's file listing, keeps only files whose extension is
    in *allowed_extensions*, creates a pending Upload for the first one
    and downloads it.  The pending Upload is deleted if anything fails.
    """
    files_data = get_files_data(item_data["identifier"], session)
    to_download = list(
        filter_files(
            files_data["result"],
            allowed_extensions=allowed_extensions,
        )
    )
    # Guard against items with no supported audio file: the old code
    # crashed with an IndexError on to_download[0].
    if not to_download:
        logger.info(
            f"No file with a supported extension found for item {item_data['identifier']}"
        )
        return
    file_data = to_download[0]
    url = f"https://archive.org/download/{item_data['identifier']}/{file_data['name']}"
    upload = create_upload(url, track, file_data)
    try:
        with tempfile.TemporaryDirectory() as temp_dir:
            path = os.path.join(temp_dir, file_data["name"])
            # Pass the selected file's metadata dict: download_file reads
            # to_download["name"], so handing it the whole list (as the
            # old code did) raised a TypeError.
            download_file(
                path,
                url=url,
                session=session,
                checksum=file_data["sha1"],
                upload=upload,
                to_download=file_data,
            )
            logger.info(f"Finished to download item {item_data['identifier']}...")
    except Exception:
        # Never leave a half-created pending upload behind; re-raise with
        # the original traceback.
        upload.delete()
        raise
def check_integrity(path, expected_checksum):
    """Return ``True`` when the sha1 digest of the file at *path* matches.

    Hashes in fixed-size chunks so large downloads are never read into
    memory at once (the old code did ``f.read()`` on the whole file).
    """
    digest = hashlib.sha1()
    with open(path, mode="rb") as f:
        while chunk := f.read(1024 * 64):
            digest.update(chunk)
    return expected_checksum == digest.hexdigest()
def get_files_data(identifier, session):
    """Fetch and return the archive.org file listing for *identifier*."""
    url = f"https://archive.org/metadata/{identifier}/files"
    logger.info(f"Fetching files data at {url}...")
    response = session.get(url)
    with response:
        return response.json()
def download_file(path, url, session, checksum, upload, to_download):
    """Download *url* to *path* and attach the resulting file to *upload*.

    Args:
        path: destination path on disk.
        url: remote file URL.
        session: ``requests.Session`` used for the download.
        checksum: expected sha1 hex digest of the file.
        upload: ``models.Upload`` row to attach the file to.
        to_download: metadata dict of the file being downloaded; its
            ``"name"`` key is used as the stored filename.

    Returns the updated upload (or ``None`` when the file was already
    present and intact).
    """
    if os.path.exists(path) and check_integrity(path, checksum):
        logger.info(f"Skipping already downloaded file at {path}")
        return
    logger.info(f"Downloading file {url}...")
    try:
        # Stream to disk instead of buffering the whole payload in memory.
        with session.get(url, stream=True) as response, open(path, mode="wb") as f:
            for chunk in response.iter_content(chunk_size=1024 * 64):
                f.write(chunk)
    except requests.exceptions.Timeout as e:
        # The old code caught asyncio.TimeoutError, which requests never
        # raises, then fell through and marked a partial file as
        # "finished".  Catch the proper exception and propagate so the
        # caller can delete the pending upload.
        logger.error(f"Timeout error while downloading {url}: {e}")
        raise
    with open(path, "rb") as f:
        upload.audio_file.save(f"{to_download['name']}", File(f))
    upload.import_status = "finished"
    upload.url = url
    upload.save()
    return upload
def filter_files(files, allowed_extensions):
    """Yield the file entries whose extension is in *allowed_extensions*.

    When *allowed_extensions* is falsy, every entry is yielded unchanged.
    """
    for entry in files:
        if not allowed_extensions:
            yield entry
            continue
        extension = os.path.splitext(entry["name"])[-1][1:]
        if extension in allowed_extensions:
            yield entry
def get_search_url(query, page_size, page):
    """Build an archive.org advanced-search URL for *query*.

    The query string is URL-encoded; results are sorted by newest first
    and returned as JSON.
    """
    encoded_query = urllib.parse.urlencode({"q": query})
    base = "https://archive.org/advancedsearch.php"
    tail = f"sort[]=addeddate+desc&rows={page_size}&page={page}&output=json"
    return f"{base}?{encoded_query}&{tail}"