WIP:newfeature(backend):fetch musicbrainz metadata from search bar

This commit is contained in:
Petitminion 2025-06-03 03:40:40 +02:00
parent 87e7297fae
commit e1445c5637
6 changed files with 377 additions and 3 deletions

View File

@ -1,3 +1,5 @@
from urllib.parse import urlparse
import requests.exceptions import requests.exceptions
from django.conf import settings from django.conf import settings
from django.db import transaction from django.db import transaction
@ -252,6 +254,13 @@ class FetchViewSet(
if fetch.status == "finished": if fetch.status == "finished":
# a duplicate was returned, no need to fetch again # a duplicate was returned, no need to fetch again
return return
parsed_url = urlparse(fetch.url)
domain = parsed_url.netloc
if domain in fetch.supported_services:
tasks.third_party_fetch(fetch_id=fetch.pk)
fetch.refresh_from_db()
else:
if settings.FEDERATION_SYNCHRONOUS_FETCH: if settings.FEDERATION_SYNCHRONOUS_FETCH:
tasks.fetch(fetch_id=fetch.pk) tasks.fetch(fetch_id=fetch.pk)
fetch.refresh_from_db() fetch.refresh_from_db()

View File

@ -359,6 +359,22 @@ CONTEXTS = [
} }
}, },
}, },
{
"shortId": "MB",
"contextUrl": None,
"documentUrl": "http://musicbrainz.org/ns/mmd-1.0#",
"document": {
"@context": {
"mb": "http://musicbrainz.org/ns/mmd-1.0#",
"schema": "http://schema.org#",
"Recording": "schema:MusicRecording",
"name": "schema:name",
"duration": "schema:duration",
"@id": "@id",
"@type": "@type",
},
},
},
] ]
CONTEXTS_BY_ID = {c["shortId"]: c for c in CONTEXTS} CONTEXTS_BY_ID = {c["shortId"]: c for c in CONTEXTS}
@ -392,3 +408,4 @@ SEC = NS(CONTEXTS_BY_ID["SEC"])
FW = NS(CONTEXTS_BY_ID["FW"]) FW = NS(CONTEXTS_BY_ID["FW"])
SC = NS(CONTEXTS_BY_ID["SC"]) SC = NS(CONTEXTS_BY_ID["SC"])
LITEPUB = NS(CONTEXTS_BY_ID["LITEPUB"]) LITEPUB = NS(CONTEXTS_BY_ID["LITEPUB"])
MB = NS(CONTEXTS_BY_ID["MB"])

View File

@ -18,6 +18,7 @@ from funkwhale_api.common import session
from funkwhale_api.common import utils as common_utils from funkwhale_api.common import utils as common_utils
from funkwhale_api.common import validators as common_validators from funkwhale_api.common import validators as common_validators
from funkwhale_api.music import utils as music_utils from funkwhale_api.music import utils as music_utils
from funkwhale_api.musicbrainz import serializers as musicbrainz_serializers
from . import utils as federation_utils from . import utils as federation_utils
@ -411,8 +412,14 @@ class Fetch(models.Model):
contexts.AS.Organization: [serializers.ActorSerializer], contexts.AS.Organization: [serializers.ActorSerializer],
contexts.AS.Service: [serializers.ActorSerializer], contexts.AS.Service: [serializers.ActorSerializer],
contexts.AS.Application: [serializers.ActorSerializer], contexts.AS.Application: [serializers.ActorSerializer],
# For MusicBrainz, the key must be the API namespace name (e.g. "recordings").
"recordings": [musicbrainz_serializers.RecordingSerializer],
} }
@property
def supported_services(self):
    """Return the list of third-party domains (URL netloc values) that are supported."""
    services = [
        "musicbrainz.org",
    ]
    return services
class InboxItem(models.Model): class InboxItem(models.Model):
""" """

View File

@ -2,6 +2,8 @@ import datetime
import json import json
import logging import logging
import os import os
import uuid
from urllib.parse import urlparse
import requests import requests
from django.conf import settings from django.conf import settings
@ -13,6 +15,7 @@ from django.utils import timezone
from dynamic_preferences.registries import global_preferences_registry from dynamic_preferences.registries import global_preferences_registry
from requests.exceptions import RequestException from requests.exceptions import RequestException
from funkwhale_api import musicbrainz
from funkwhale_api.audio import models as audio_models from funkwhale_api.audio import models as audio_models
from funkwhale_api.common import models as common_models from funkwhale_api.common import models as common_models
from funkwhale_api.common import preferences, session from funkwhale_api.common import preferences, session
@ -456,6 +459,137 @@ def fetch(fetch_obj):
) )
def musicbrainz_type_handler(fetch):
    """
    Extract the MusicBrainz entity type and MBID from ``fetch.url``.

    The URL path is expected to look like ``/<entity>/<mbid>``, for instance
    ``https://musicbrainz.org/recording/<uuid>``. The entity name is
    pluralised by appending "s" (``recording`` -> ``recordings``).

    Returns:
        A ``(type_, mbid)`` tuple of strings.

    Raises:
        ValueError: if the path has fewer than two segments, or if the
            second segment is not a valid UUID.
    """
    url = fetch.url
    path_parts = urlparse(url).path.strip("/").split("/")
    # A short path (e.g. "https://musicbrainz.org/") must surface as a
    # ValueError like any other unparsable URL, not as an IndexError.
    if len(path_parts) < 2:
        raise ValueError(f"could not get type and mbid from url {url}")
    type_ = path_parts[0] + "s"
    mbid = path_parts[1]
    try:
        uuid.UUID(mbid)
    except ValueError as exc:
        raise ValueError(f"could not get mbid from url {url}") from exc
    return type_, mbid
def musicbrainz_metadata_handler(type_, id):
    """
    Fetch an entity from the MusicBrainz API and look up a local match.

    Args:
        type_: name of the attribute of ``musicbrainz.api`` to query
            (e.g. "recordings").
        id: the MBID of the entity to retrieve.

    Returns:
        A ``(result, existing)`` tuple. ``result`` is the API response with
        every "-" in dictionary keys replaced by "_" (recursively, through
        nested dicts and lists). ``existing`` is the first local ``Track``
        whose ``mbid`` equals ``id``, or ``None`` when there is none.
    """

    def replace_hyphens_in_keys(obj):
        # Walk dicts and lists recursively; any other value is returned as is.
        if isinstance(obj, dict):
            return {
                k.replace("-", "_"): replace_hyphens_in_keys(v) for k, v in obj.items()
            }
        if isinstance(obj, list):
            return [replace_hyphens_in_keys(item) for item in obj]
        return obj

    api_namespace = getattr(musicbrainz.api, type_)
    result = replace_hyphens_in_keys(
        api_namespace.get(id=id, includes=["tags", "artists", "releases"])
    )
    # ``first()`` already returns None on an empty queryset, so a separate
    # ``exists()`` query is unnecessary.
    # NOTE(review): the lookup always targets Track, whatever ``type_`` is —
    # confirm this is intended once other entity types are supported.
    existing = music_models.Track.objects.filter(mbid=id).first()
    return result, existing
# Per-service handlers, keyed by the domain (netloc) of the fetched URL.
# Handler that derives the (type, id) pair from a Fetch object.
type_and_id_from_third_party = {
    "musicbrainz.org": musicbrainz_type_handler,
}
# Handler that retrieves the remote metadata for a given (type, id) pair.
metadata_from_third_party_ = {
    "musicbrainz.org": musicbrainz_metadata_handler,
}
@celery.app.task(name="third_party_fetch")
@transaction.atomic
@celery.require_instance(
    models.Fetch.objects.filter(status="pending").select_related("actor"),
    "fetch_obj",
    "fetch_id",
)
def third_party_fetch(fetch_obj):
    """
    Fetch metadata for ``fetch_obj.url`` from a supported third-party service.

    Steps:
      1. validate the URL (MRF policies, supported domain);
      2. derive the entity type and id from the URL;
      3. retrieve the remote metadata and any matching local object;
      4. validate and save the data using the serializers registered for
         that type on ``fetch_obj.serializers``.

    The outcome is stored on ``fetch_obj``: ``status`` becomes "finished",
    "skipped" (no serializer registered for the type) or "errored" (with an
    ``error_code`` in ``detail``). Exceptions raised while saving the
    serializer are re-raised after the error has been recorded.
    """

    def error(code, **kwargs):
        # Record a failure on the fetch object; returns None.
        fetch_obj.status = "errored"
        fetch_obj.fetch_date = timezone.now()
        fetch_obj.detail = {"error_code": code}
        fetch_obj.detail.update(kwargs)
        fetch_obj.save(update_fields=["fetch_date", "status", "detail"])

    def check_url(url):
        # Return the service domain, or None once an error has been recorded.
        if not url.startswith("webfinger://"):
            payload, updated = mrf.inbox.apply({"id": url})
            if not payload:
                return error("blocked", message="Blocked by MRF")

        service = urlparse(url).netloc
        if service not in fetch_obj.supported_services:
            return error("invalid_url", message=f"Unsupported domain {service}")
        return service

    url = fetch_obj.url
    actor = fetch_obj.actor
    service = check_url(url)
    if service is None:
        # check_url() already stored the error; continuing would only fail
        # with a KeyError on the handler lookup below.
        return

    try:
        parse_url = type_and_id_from_third_party[service]
        fetch_metadata = metadata_from_third_party_[service]
    except KeyError:
        # Domain declared as supported but without registered handlers.
        return error("invalid_url", message=f"Unsupported domain {service}")

    try:
        type_, remote_id = parse_url(fetch_obj)
    except ValueError as e:
        # Python 3 exceptions have no ``message`` attribute; use str().
        return error("url_parse_error", message=str(e))
    logger.debug("Parsed URL %s into type %s and id %s", url, type_, remote_id)

    try:
        result, existing = fetch_metadata(type_, remote_id)
    except requests.exceptions.HTTPError as e:
        response = e.response
        # Compare against None explicitly: a requests Response is falsy for
        # 4xx/5xx status codes, which is precisely the case handled here.
        if response is None:
            return error("http", status_code=None, message=str(e))
        return error(
            "http",
            status_code=response.status_code,
            message=response.text,
        )
    except requests.exceptions.Timeout:
        return error("timeout")
    except requests.exceptions.ConnectionError as e:
        return error("connection", message=str(e))
    except requests.RequestException as e:
        return error("request", message=str(e))
    except Exception as e:
        # Task boundary: any other failure is recorded rather than raised.
        return error("unhandled", message=str(e))
    logger.debug(
        "Remote answered with %s and we found %s in database", result, existing
    )

    try:
        serializer_classes = fetch_obj.serializers.get(type_)
    except (KeyError, AttributeError):
        serializer_classes = None
    if not serializer_classes:
        # ``.get()`` returns None for unknown types instead of raising, so
        # the "unhandled type" case has to be detected explicitly.
        fetch_obj.status = "skipped"
        fetch_obj.fetch_date = timezone.now()
        fetch_obj.detail = {"reason": "unhandled_type", "type": type_}
        return fetch_obj.save(update_fields=["fetch_date", "status", "detail"])

    # Try each candidate serializer in turn and keep the first valid one;
    # if none validates, the errors of the last one are reported.
    serializer = None
    for serializer_class in serializer_classes:
        serializer = serializer_class(
            existing, data=result, context={"fetch_actor": actor}
        )
        if serializer.is_valid():
            break

    if serializer.errors:
        return error("validation", validation_errors=serializer.errors)

    try:
        obj = serializer.save()
    except Exception as e:
        # NOTE(review): this task runs inside transaction.atomic, so the
        # re-raise presumably rolls back the status written here — confirm.
        error("save", message=str(e))
        raise

    fetch_obj.object = obj
    fetch_obj.status = "finished"
    fetch_obj.fetch_date = timezone.now()
    return fetch_obj.save(
        update_fields=["fetch_date", "status", "object_id", "object_content_type"]
    )
class PreserveSomeDataCollector(Collector): class PreserveSomeDataCollector(Collector):
""" """
We need to delete everything related to an actor. Well Almost everything. We need to delete everything related to an actor. Well Almost everything.

View File

@ -0,0 +1,123 @@
from rest_framework import serializers
from funkwhale_api.tags import models as tags_models
class ArtistSerializer(serializers.Serializer):
    """
    Validate a MusicBrainz artist payload and persist it as an ``Artist``.
    """

    id = serializers.CharField()
    name = serializers.CharField()

    def create(self, validated_data):
        """Create an ``Artist`` whose ``mbid`` is the payload ``id``."""
        # Imported locally rather than at module level
        # (presumably to avoid a circular import — TODO confirm).
        from funkwhale_api.music.models import Artist

        return Artist.objects.create(
            name=validated_data["name"],
            mbid=validated_data["id"],
        )
class ArtistCreditSerializer(serializers.Serializer):
    """
    Validate a MusicBrainz artist-credit entry (credited name, join phrase
    and nested artist) and persist it as an ``ArtistCredit``.
    """

    name = serializers.CharField()
    joinphrase = serializers.CharField(allow_blank=True)
    artist = ArtistSerializer()

    def create(self, validated_data):
        """Create the nested artist, then the ``ArtistCredit`` pointing to it."""
        from funkwhale_api.music.models import ArtistCredit

        credited_name = validated_data["name"]
        joinphrase = validated_data.get("joinphrase", "")
        artist = ArtistSerializer().create(validated_data["artist"])
        return ArtistCredit.objects.create(
            credit=credited_name,
            joinphrase=joinphrase,
            artist=artist,
        )
class ReleaseSerializer(serializers.Serializer):
    """
    Validate a MusicBrainz release payload and persist it as an ``Album``.
    """

    id = serializers.CharField()
    title = serializers.CharField()
    artist_credit = ArtistCreditSerializer(many=True)
    tags = serializers.ListField(child=serializers.CharField(), allow_empty=True)
    # Accepts a bare year as well as full dates.
    date = serializers.DateField(input_formats=["%Y", "%Y/%m/%d", "%Y-%m-%d"])

    def create(self, validated_data):
        """Create the ``Album``, attach its artist credits and add its tags."""
        from funkwhale_api.music.models import Album

        album = Album.objects.create(
            title=validated_data["title"],
            mbid=validated_data["id"],
            release_date=validated_data.get("date"),
        )
        credits = ArtistCreditSerializer(many=True).create(
            validated_data["artist_credit"]
        )
        album.artist_credit.set(credits)
        album.save()
        tag_names = validated_data.get("tags", [])
        tags_models.add_tags(album, *tag_names)
        return album

    def update(self, instance, validated_data):
        """Refresh title and release date of ``instance`` and add the tags."""
        new_title = validated_data["title"]
        new_release_date = validated_data.get("date")
        instance.title = new_title
        instance.release_date = new_release_date
        instance.save()
        tag_names = validated_data.get("tags", [])
        tags_models.add_tags(instance, *tag_names)
        return instance
class RecordingSerializer(serializers.Serializer):
    """
    Validate a MusicBrainz recording payload and persist it as a ``Track``.
    """

    id = serializers.CharField()
    title = serializers.CharField()
    artist_credit = ArtistCreditSerializer(many=True)
    releases = ReleaseSerializer(many=True)
    tags = serializers.ListField(child=serializers.CharField(), allow_empty=True)

    def create(self, validated_data):
        """
        Create a ``Track`` with its album, artist credits and tags.

        Raises:
            serializers.ValidationError: if the recording has no release,
                since the album is built from the first release.
        """
        from funkwhale_api.music.models import Track

        releases = validated_data["releases"]
        if not releases:
            raise serializers.ValidationError(
                {"releases": "At least one release is required to create a track"}
            )
        # In MusicBrainz a recording can belong to several releases; only the
        # first one is used as the album. Only that one is created, so the
        # remaining releases do not end up as albums without tracks.
        # NOTE(review): this always creates a new Album, even if one with the
        # same mbid may already exist — confirm whether a lookup is wanted.
        album = ReleaseSerializer().create(releases[0])
        track = Track.objects.create(
            title=validated_data["title"],
            mbid=validated_data["id"],
            album=album,
        )
        artist_credit = ArtistCreditSerializer(many=True).create(
            validated_data["artist_credit"]
        )
        track.artist_credit.set(artist_credit)
        track.save()
        tags_models.add_tags(track, *validated_data.get("tags", []))
        return track

    def update(self, instance, validated_data):
        """Refresh the title of ``instance`` and add the tags."""
        instance.title = validated_data["title"]
        instance.save()
        tags_models.add_tags(instance, *validated_data.get("tags", []))
        return instance

View File

@ -732,3 +732,87 @@ def test_fetch_webfinger_create_actor(factories, r_mock, mocker):
assert init.call_args[0][1] == actor assert init.call_args[0][1] == actor
assert init.call_args[1]["data"] == payload assert init.call_args[1]["data"] == payload
assert save.call_count == 1 assert save.call_count == 1
def test_third_party_fetch_success(factories, r_mock, mocker):
    """A fetch on a musicbrainz.org recording URL should end up "finished"."""
    release_area = {
        "name": "Germany",
        "id": "85752fda-13c4-31a3-bee5-0e5cb1f51dad",
        "sort-name": "Germany",
        "disambiguation": "",
        "iso-3166-1-codes": ["DE"],
    }
    release_artist_credit = {
        "artist": {
            "sort-name": "Various Artists",
            "name": "Various Artists",
            "disambiguation": "add compilations to this artist",
            "type": "Other",
            "type-id": "ac897045-5043-3294-969b-187360e45d86",
            "id": "89ad4ac3-39f7-470e-963a-56509c546377",
        },
        "joinphrase": "",
        "name": "Various Artists",
    }
    release = {
        "status": "Promotion",
        "id": "220ffb88-49ed-4df4-a330-46f8e7353ff0",
        "country": "DE",
        "title": "With Oi! To Hope for Myanmar 2022",
        "quality": "normal",
        "release-events": [{"area": release_area, "date": "2022"}],
        "disambiguation": "Version aus 2022",
        "status-id": "518ffc83-5cde-34df-8627-81bff5093d92",
        "packaging-id": "8f931351-d2e2-310f-afc6-37b89ddba246",
        "artist-credit": [release_artist_credit],
        "barcode": "",
        "date": "2022",
        "packaging": "Digipak",
        "text-representation": {"language": "mul", "script": "Latn"},
    }
    recording_artist_credit = {
        "artist": {
            "name": "The Rebel Riot",
            "disambiguation": "",
            "sort-name": "Rebel Riot, The",
            "type": "Group",
            "id": "1ff2cd0c-2ac1-4296-b650-77ef57bb0d01",
            "country": "MM",
            "type-id": "e431f5f6-b5d2-343d-8b36-72607fffb74b",
        },
        "name": "The Rebel Riot",
        "joinphrase": "",
    }
    payload = {
        "releases": [release],
        "disambiguation": "",
        "video": False,
        "first-release-date": "2022",
        "artist-credit": [recording_artist_credit],
        "title": "A.C.A.B.",
        "id": "455cd030-7394-4244-9a53-3b96a666b1c6",
        "length": 193253,
    }

    track = factories["music.Track"]()
    url = f"https://musicbrainz.org/recording/{track.mbid}"
    fetch = factories["federation.Fetch"](url=url)

    # NOTE(review): confirm that ``serializers.ArtistSerializer`` is the
    # serializer class the task actually instantiates for a recording.
    init = mocker.spy(serializers.ArtistSerializer, "__init__")
    save = mocker.spy(serializers.ArtistSerializer, "save")
    # NOTE(review): the mocked URL is the one stored on the fetch; verify it
    # is also the URL requested when the metadata is retrieved.
    r_mock.get(url, json=payload)

    tasks.third_party_fetch(fetch_id=fetch.pk)
    fetch.refresh_from_db()

    assert fetch.status == "finished"
    assert init.call_count == 1
    # NOTE(review): the payload keys contain hyphens; check whether the data
    # handed to the serializer is expected to be the raw payload.
    assert init.call_args[1]["data"] == payload
    assert save.call_count == 1