See #432: tags acquisition (from audio files)

This commit is contained in:
Eliot Berriot 2019-07-12 15:06:39 +02:00
parent 8ee5578693
commit 7bc8109732
10 changed files with 119 additions and 4 deletions

View File

@ -2,6 +2,7 @@ import base64
import datetime import datetime
import logging import logging
import pendulum import pendulum
import re
import mutagen._util import mutagen._util
import mutagen.oggtheora import mutagen.oggtheora
@ -144,6 +145,7 @@ CONF = {
"mbid": {"field": "musicbrainz_trackid"}, "mbid": {"field": "musicbrainz_trackid"},
"license": {}, "license": {},
"copyright": {}, "copyright": {},
"genre": {},
}, },
}, },
"OggVorbis": { "OggVorbis": {
@ -162,6 +164,7 @@ CONF = {
"mbid": {"field": "musicbrainz_trackid"}, "mbid": {"field": "musicbrainz_trackid"},
"license": {}, "license": {},
"copyright": {}, "copyright": {},
"genre": {},
"pictures": { "pictures": {
"field": "metadata_block_picture", "field": "metadata_block_picture",
"to_application": clean_ogg_pictures, "to_application": clean_ogg_pictures,
@ -184,6 +187,7 @@ CONF = {
"mbid": {"field": "MusicBrainz Track Id"}, "mbid": {"field": "MusicBrainz Track Id"},
"license": {}, "license": {},
"copyright": {}, "copyright": {},
"genre": {},
}, },
}, },
"MP3": { "MP3": {
@ -199,6 +203,7 @@ CONF = {
"date": {"field": "TDRC"}, "date": {"field": "TDRC"},
"musicbrainz_albumid": {"field": "MusicBrainz Album Id"}, "musicbrainz_albumid": {"field": "MusicBrainz Album Id"},
"musicbrainz_artistid": {"field": "MusicBrainz Artist Id"}, "musicbrainz_artistid": {"field": "MusicBrainz Artist Id"},
"genre": {"field": "TCON"},
"musicbrainz_albumartistid": {"field": "MusicBrainz Album Artist Id"}, "musicbrainz_albumartistid": {"field": "MusicBrainz Album Artist Id"},
"mbid": {"field": "UFID", "getter": get_mp3_recording_id}, "mbid": {"field": "UFID", "getter": get_mp3_recording_id},
"pictures": {}, "pictures": {},
@ -220,6 +225,7 @@ CONF = {
"musicbrainz_albumid": {}, "musicbrainz_albumid": {},
"musicbrainz_artistid": {}, "musicbrainz_artistid": {},
"musicbrainz_albumartistid": {}, "musicbrainz_albumartistid": {},
"genre": {},
"mbid": {"field": "musicbrainz_trackid"}, "mbid": {"field": "musicbrainz_trackid"},
"test": {}, "test": {},
"pictures": {}, "pictures": {},
@ -485,6 +491,61 @@ class PermissiveDateField(serializers.CharField):
return None return None
TAG_REGEX = re.compile(r"^((\w+)([\d_]*))$")


def extract_tags_from_genre(string):
    """
    Turn a free-form genre string into a list of tag names.

    The string is first cut on the usual separators (" - ", ",", ";" and
    "/"). Each resulting piece is stripped, its hyphens are turned into
    spaces, and it is kept only if, once its spaces are removed, it matches
    ``TAG_REGEX``. Single-word pieces are kept untouched; multi-word pieces
    are merged into a single word with the first letter of every part
    uppercased ('pop rock' -> 'PopRock').
    """
    # Funnel every separator into one marker so that a single split is enough
    marker = "@@@@@"
    for separator in (" - ", ",", ";", "/"):
        string = string.replace(separator, marker)

    tags = []
    for chunk in string.split(marker):
        # 'Pop-Rock' -> 'Pop Rock' here, merged into 'PopRock' further down
        candidate = chunk.strip().replace("-", " ")
        if not candidate:
            continue
        if TAG_REGEX.match(candidate.replace(" ", "")) is None:
            # the piece holds characters the pattern rejects ($, €, etc.):
            # such pieces are simply dropped
            continue

        words = candidate.split(" ")
        if len(words) == 1:
            # no space at all, the piece is used as the tag without any change
            tags.append(candidate)
            continue

        # several parts: glue them together, capitalizing each first letter
        merged = "".join(word[0].upper() + word[1:] for word in words if word)
        if merged:
            tags.append(merged)
    return tags
class TagsField(serializers.CharField):
    """
    Serializer field that builds a list of tags from the "genre" metadata.
    """

    def get_value(self, data):
        # Pass the whole metadata object through: the lookup of the "genre"
        # key is done in to_internal_value
        return data

    def to_internal_value(self, data):
        """
        Return the tags extracted from the "genre" entry of ``data``.

        An empty list is returned when the lookup raises ``TagNotFound``.
        A missing/falsy genre is handled as an empty string.
        """
        try:
            genre = data.get("genre")
        except TagNotFound:
            return []
        # Run the regular CharField conversion on the stringified genre
        # before splitting it into tags
        cleaned = super().to_internal_value(str(genre or ""))
        return extract_tags_from_genre(cleaned)
class MBIDField(serializers.UUIDField): class MBIDField(serializers.UUIDField):
def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs):
kwargs.setdefault("allow_null", True) kwargs.setdefault("allow_null", True)
@ -533,6 +594,7 @@ class TrackMetadataSerializer(serializers.Serializer):
copyright = serializers.CharField(allow_blank=True, allow_null=True, required=False) copyright = serializers.CharField(allow_blank=True, allow_null=True, required=False)
license = serializers.CharField(allow_blank=True, allow_null=True, required=False) license = serializers.CharField(allow_blank=True, allow_null=True, required=False)
mbid = MBIDField() mbid = MBIDField()
tags = TagsField(allow_blank=True, allow_null=True, required=False)
album = AlbumField() album = AlbumField()
artists = ArtistField() artists = ArtistField()
@ -544,6 +606,7 @@ class TrackMetadataSerializer(serializers.Serializer):
"position", "position",
"disc_number", "disc_number",
"mbid", "mbid",
"tags",
] ]
def validate(self, validated_data): def validate(self, validated_data):
@ -553,7 +616,7 @@ class TrackMetadataSerializer(serializers.Serializer):
v = validated_data[field] v = validated_data[field]
except KeyError: except KeyError:
continue continue
if v in ["", None]: if v in ["", None, []]:
validated_data.pop(field) validated_data.pop(field)
return validated_data return validated_data

View File

@ -14,6 +14,7 @@ from requests.exceptions import RequestException
from funkwhale_api.common import channels, preferences from funkwhale_api.common import channels, preferences
from funkwhale_api.federation import routes from funkwhale_api.federation import routes
from funkwhale_api.federation import library as lb from funkwhale_api.federation import library as lb
from funkwhale_api.tags import models as tags_models
from funkwhale_api.taskapp import celery from funkwhale_api.taskapp import celery
from . import licenses from . import licenses
@ -541,10 +542,12 @@ def _get_track(data, attributed_to=None):
if data.get("fdate"): if data.get("fdate"):
defaults["creation_date"] = data.get("fdate") defaults["creation_date"] = data.get("fdate")
track = get_best_candidate_or_create( track, created = get_best_candidate_or_create(
models.Track, query, defaults=defaults, sort_fields=["mbid", "fid"] models.Track, query, defaults=defaults, sort_fields=["mbid", "fid"]
)[0] )
if created:
tags_models.add_tags(track, *data.get("tags", []))
return track return track

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

View File

@ -57,6 +57,7 @@ def test_can_get_metadata_all():
"musicbrainz_albumartistid": "013c8e5b-d72a-4cd3-8dee-6c64d6125823;5b4d7d2d-36df-4b38-95e3-a964234f520f", "musicbrainz_albumartistid": "013c8e5b-d72a-4cd3-8dee-6c64d6125823;5b4d7d2d-36df-4b38-95e3-a964234f520f",
"license": "Dummy license: http://creativecommons.org/licenses/by-sa/4.0/", "license": "Dummy license: http://creativecommons.org/licenses/by-sa/4.0/",
"copyright": "Someone", "copyright": "Someone",
"genre": "Classical",
} }
assert data.all() == expected assert data.all() == expected
@ -249,6 +250,7 @@ def test_metadata_fallback_ogg_theora(mocker):
"mbid": uuid.UUID("f269d497-1cc0-4ae4-a0c4-157ec7d73fcb"), "mbid": uuid.UUID("f269d497-1cc0-4ae4-a0c4-157ec7d73fcb"),
"license": "https://creativecommons.org/licenses/by-nc-nd/2.5/", "license": "https://creativecommons.org/licenses/by-nc-nd/2.5/",
"copyright": "Someone", "copyright": "Someone",
"tags": ["Funk"],
}, },
), ),
( (
@ -281,6 +283,7 @@ def test_metadata_fallback_ogg_theora(mocker):
"mbid": uuid.UUID("bd21ac48-46d8-4e78-925f-d9cc2a294656"), "mbid": uuid.UUID("bd21ac48-46d8-4e78-925f-d9cc2a294656"),
"license": "Dummy license: http://creativecommons.org/licenses/by-sa/4.0/", "license": "Dummy license: http://creativecommons.org/licenses/by-sa/4.0/",
"copyright": "Someone", "copyright": "Someone",
"tags": ["Classical"],
}, },
), ),
( (
@ -313,6 +316,7 @@ def test_metadata_fallback_ogg_theora(mocker):
"mbid": uuid.UUID("bd21ac48-46d8-4e78-925f-d9cc2a294656"), "mbid": uuid.UUID("bd21ac48-46d8-4e78-925f-d9cc2a294656"),
"license": "Dummy license: http://creativecommons.org/licenses/by-sa/4.0/", "license": "Dummy license: http://creativecommons.org/licenses/by-sa/4.0/",
"copyright": "Someone", "copyright": "Someone",
"tags": ["Classical"],
}, },
), ),
( (
@ -336,6 +340,7 @@ def test_metadata_fallback_ogg_theora(mocker):
} }
], ],
}, },
"tags": ["Rock"],
"position": 1, "position": 1,
"disc_number": 1, "disc_number": 1,
"mbid": uuid.UUID("124d0150-8627-46bc-bc14-789a3bc960c8"), "mbid": uuid.UUID("124d0150-8627-46bc-bc14-789a3bc960c8"),
@ -371,6 +376,7 @@ def test_metadata_fallback_ogg_theora(mocker):
"mbid": uuid.UUID("30f3f33e-8d0c-4e69-8539-cbd701d18f28"), "mbid": uuid.UUID("30f3f33e-8d0c-4e69-8539-cbd701d18f28"),
"license": "http://creativecommons.org/licenses/by-nc-sa/3.0/us/", "license": "http://creativecommons.org/licenses/by-nc-sa/3.0/us/",
"copyright": "2008 nin", "copyright": "2008 nin",
"tags": ["Industrial"],
}, },
), ),
], ],
@ -607,3 +613,43 @@ def test_artist_field_featuring():
value = field.get_value(data) value = field.get_value(data)
assert field.to_internal_value(value) == expected assert field.to_internal_value(value) == expected
@pytest.mark.parametrize(
    "genre, expected_tags",
    [
        ("Pop", ["Pop"]),
        ("pop", ["pop"]),
        ("Pop-Rock", ["PopRock"]),
        ("Pop - Rock", ["Pop", "Rock"]),
        ("Soundtrack - Cute Anime", ["Soundtrack", "CuteAnime"]),
        ("Pop, Rock", ["Pop", "Rock"]),
        ("Chanson française", ["ChansonFrançaise"]),
        ("Unhandled❤", []),
        # The non-breaking spaces are spelled as explicit escapes: written as
        # raw invisible characters they are easily lost by editors and tools,
        # which silently turns this case into a plain ASCII genre.
        # NOTE(review): positions of the non-breaking spaces are assumed to
        # be between the words -- confirm against the original fixture.
        ("tag\u00a0with\u00a0non-breaking\u00a0spaces", []),
    ],
)
def test_acquire_tags_from_genre(genre, expected_tags):
    """
    The "genre" metadata must end up as the expected "tags" entry in the
    validated data, and no "tags" entry at all when nothing can be extracted.
    """
    data = {
        "title": "Track Title",
        "artist": "Track Artist",
        "album": "Track Album",
        "genre": genre,
    }
    expected = {
        "title": "Track Title",
        "artists": [{"name": "Track Artist", "mbid": None}],
        "album": {
            "title": "Track Album",
            "mbid": None,
            "release_date": None,
            "artists": [],
        },
        "cover_data": None,
    }
    # An empty tag list is expected to be absent from the validated data
    if expected_tags:
        expected["tags"] = expected_tags

    serializer = metadata.TrackMetadataSerializer(data=metadata.FakeMetadata(data))
    assert serializer.is_valid(raise_exception=True) is True
    assert serializer.validated_data == expected

View File

@ -448,7 +448,7 @@ def test_get_audio_data(factories):
result = upload.get_audio_data() result = upload.get_audio_data()
assert result == {"duration": 1, "bitrate": 112000, "size": 14858} assert result == {"duration": 1, "bitrate": 112000, "size": 15918}
def test_library_queryset_with_follows(factories): def test_library_queryset_with_follows(factories):

View File

@ -18,6 +18,7 @@ DATA_DIR = os.path.dirname(os.path.abspath(__file__))
def test_can_create_track_from_file_metadata_no_mbid(db, mocker): def test_can_create_track_from_file_metadata_no_mbid(db, mocker):
add_tags = mocker.patch("funkwhale_api.tags.models.add_tags")
metadata = { metadata = {
"title": "Test track", "title": "Test track",
"artists": [{"name": "Test artist"}], "artists": [{"name": "Test artist"}],
@ -26,6 +27,7 @@ def test_can_create_track_from_file_metadata_no_mbid(db, mocker):
"disc_number": 2, "disc_number": 2,
"license": "Hello world: http://creativecommons.org/licenses/by-sa/4.0/", "license": "Hello world: http://creativecommons.org/licenses/by-sa/4.0/",
"copyright": "2018 Someone", "copyright": "2018 Someone",
"tags": ["Punk", "Rock"],
} }
match_license = mocker.spy(licenses, "match") match_license = mocker.spy(licenses, "match")
@ -44,6 +46,7 @@ def test_can_create_track_from_file_metadata_no_mbid(db, mocker):
assert track.artist.mbid is None assert track.artist.mbid is None
assert track.artist.attributed_to is None assert track.artist.attributed_to is None
match_license.assert_called_once_with(metadata["license"], metadata["copyright"]) match_license.assert_called_once_with(metadata["license"], metadata["copyright"])
add_tags.assert_called_once_with(track, *metadata["tags"])
def test_can_create_track_from_file_metadata_attributed_to(factories, mocker): def test_can_create_track_from_file_metadata_attributed_to(factories, mocker):