From 4c9cbc34133c8c46885568e3a922dc644bc9de0d Mon Sep 17 00:00:00 2001 From: Petitminion Date: Sun, 9 Feb 2025 21:19:00 +0100 Subject: [PATCH] wikidata tasks to add artists to db --- api/config/settings/common.py | 5 + .../music/migrations/0063_artist_far_right.py | 17 ++++ api/funkwhale_api/music/tasks.py | 27 +++--- api/funkwhale_api/music/wikidata.py | 90 ++++++++++++++++++ api/poetry.lock | 92 ++++++++++++++++++- api/pyproject.toml | 1 + api/tests/music/test_wikidata.py | 44 +++++++++ docs/specs/far-right-filter/index.md | 18 ++-- 8 files changed, 276 insertions(+), 18 deletions(-) create mode 100644 api/funkwhale_api/music/migrations/0063_artist_far_right.py create mode 100644 api/funkwhale_api/music/wikidata.py create mode 100644 api/tests/music/test_wikidata.py diff --git a/api/config/settings/common.py b/api/config/settings/common.py index 776d73c2e..7afaa4b35 100644 --- a/api/config/settings/common.py +++ b/api/config/settings/common.py @@ -983,6 +983,11 @@ CELERY_BEAT_SCHEDULE = { "schedule": crontab(day_of_month="2", minute="30", hour="3"), "options": {"expires": 60 * 60 * 24}, }, + "music.wikidata_far_righ_artists": { + "task": "music.wikidata_far_righ_artists", + "schedule": crontab(day_of_month="3", minute="30", hour="3"), + "options": {"expires": 60 * 60 * 24}, + }, } if env.str("TYPESENSE_API_KEY", default=None): diff --git a/api/funkwhale_api/music/migrations/0063_artist_far_right.py b/api/funkwhale_api/music/migrations/0063_artist_far_right.py new file mode 100644 index 000000000..85d18c17a --- /dev/null +++ b/api/funkwhale_api/music/migrations/0063_artist_far_right.py @@ -0,0 +1,17 @@ +# Generated by Django 5.1.5 on 2025-02-09 16:43 + +from django.db import migrations, models + + +class Migration(migrations.Migration): + dependencies = [ + ("music", "0062_upload_third_party_provider"), + ] + + operations = [ + migrations.AddField( + model_name="artist", + name="far_right", + field=models.CharField(blank=True, max_length=100, null=True), + ), + ] 
diff --git a/api/funkwhale_api/music/tasks.py b/api/funkwhale_api/music/tasks.py index 5d815fad1..7f0082822 100644 --- a/api/funkwhale_api/music/tasks.py +++ b/api/funkwhale_api/music/tasks.py @@ -20,12 +20,11 @@ from funkwhale_api.federation import library as lb from funkwhale_api.federation import routes from funkwhale_api.federation import utils as federation_utils from funkwhale_api.music.management.commands import import_files -from funkwhale_api.music.models import Artist from funkwhale_api.tags import models as tags_models from funkwhale_api.tags import tasks as tags_tasks from funkwhale_api.taskapp import celery -from . import licenses, metadata, models, signals +from . import licenses, metadata, models, signals, wikidata logger = logging.getLogger(__name__) @@ -485,15 +484,7 @@ def get_best_candidate_or_create(model, query, defaults, sort_fields): """ candidates = model.objects.filter(query) if candidates: - sorted_candidates = sort_candidates(candidates, sort_fields) - if model == Artist and sorted_candidates[0].far_right: - raise FarRightError( - code="Far right artist detected", - detail=f"The artist name has been matched with this wikidata entity \ - {sorted_candidates[0].far_right}. This artist will not be saved. No pasaran. \ - You can checkout our coc at https://www.funkwhale.audio/code-of-conduct/", - ) - return sorted_candidates[0], False + return sort_candidates(candidates, sort_fields)[0], False return model.objects.create(**defaults), True @@ -821,9 +812,18 @@ def get_or_create_artist_from_ac(ac_data, attributed_to, from_activity_id): } if ac_data.get("fdate"): defaults["creation_date"] = ac_data.get("fdate") + artist, created = get_best_candidate_or_create( models.Artist, query, defaults=defaults, sort_fields=["mbid", "fid"] ) + if artist.far_right: + raise FarRightError( + code="Far right artist detected", + detail=f"The artist name has been matched with this wikidata entity \ + {artist.far_right}. This artist will not be saved. No pasaran. 
\ + You can checkout our coc at https://www.funkwhale.audio/code-of-conduct/", + ) + if created: tags_models.add_tags(artist, *tags) common_utils.attach_content(artist, "description", description) @@ -1267,3 +1267,8 @@ def fs_import( "broadcast": broadcast, } command.handle(**options) + + +@celery.app.task(name="music.wikidata_far_righ_artists") +def wikidata_far_righ_artists(): + wikidata.get_far_right_artists() diff --git a/api/funkwhale_api/music/wikidata.py b/api/funkwhale_api/music/wikidata.py new file mode 100644 index 000000000..6bc262829 --- /dev/null +++ b/api/funkwhale_api/music/wikidata.py @@ -0,0 +1,90 @@ +import logging +import socket + +import pytest +from django.db.models import Q +from wikibaseintegrator import WikibaseIntegrator, wbi_helpers +from wikibaseintegrator.wbi_config import config as wbi_config + +from funkwhale_api.music.models import Artist + +logger = logging.getLogger(__name__) + + +@pytest.fixture(autouse=True) +def enable_network_calls(): + socket.socket = socket.create_connection + + +WIKIDATA_QUERY = """SELECT DISTINCT ?item ?itemLabel WHERE { + SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE]". } + + ?item wdt:P31 wd:Q215380. + # Match items with the relevant properties and ensure they have references + { + VALUES ?genre { + wd:Q533914 # NSBM + wd:Q224694 # whit power music + wd:Q113084468 # nazi rock + wd:Q121411631 # neonazi music + wd:Q1547998 # rock identitaire francai + wd:Q602498 # nazi punk + wd:Q3328582 # italian right wing alternative + wd:Q828181 # rock against communism + } + + ?item p:P136 ?statement. + ?statement ps:P136 ?genre. + + + # Ensure that these statements have references + FILTER EXISTS { ?statement prov:wasDerivedFrom ?reference. 
} + } +} +""" + + +def get_far_right_artists(): + from funkwhale_api.music.tasks import get_best_candidate_or_create + + wbi_config[ + "USER_AGENT" + ] = "Funkwhale_far_righ_artist/1.0 (https://docs.funkwhale.audio/specs/far-right-filter/index.html)" + artists = [] + wbi = WikibaseIntegrator() + + results = wbi_helpers.execute_sparql_query(WIKIDATA_QUERY) + for result in results["results"]["bindings"]: + item_id = result["itemLabel"]["value"] + wkd_artist = wbi.item.get(item_id).get_json() + if wkd_artist["labels"].get("en", False): + artist_name = wkd_artist["labels"]["en"]["value"] + else: + raise ValueError(f"Artist {item_id} has no English label") + + query = Q(name=artist_name) + + if wkd_artist["claims"].get("P434", False): + artist_mbid = wkd_artist["claims"]["P434"][0]["mainsnak"]["datavalue"][ + "value" + ] + query = query & Q(mbid=artist_mbid) + else: + logger.warning( + f"Artist {artist_name} from https://www.wikidata.org/wiki/{item_id} has no MBID. Skipping creation." + ) + continue + + default = {"name": artist_name, "mbid": artist_mbid, "far_right": item_id} + + artist, created = get_best_candidate_or_create( + Artist, query, default, sort_fields=["name"] + ) + + if not created: + artist.far_right = item_id + artist.save() + + artists.append(artist) + + return artists diff --git a/api/poetry.lock b/api/poetry.lock index b0101571d..5ee4822cb 100644 --- a/api/poetry.lock +++ b/api/poetry.lock @@ -339,6 +339,18 @@ files = [ [package.extras] visualize = ["Twisted (>=16.1.1)", "graphviz (>0.5.1)"] +[[package]] +name = "backoff" +version = "2.2.1" +description = "Function decoration for backoff and retry" +optional = false +python-versions = ">=3.7,<4.0" +groups = ["main"] +files = [ + {file = "backoff-2.2.1-py3-none-any.whl", hash = "sha256:63579f9a0628e06278f7e47b7d7d5b6ce20dc65c5e96a6f3ca99a6adca0396e8"}, + {file = "backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"}, +] + [[package]] name = "billiard" 
version = "4.2.1" @@ -2516,6 +2528,27 @@ files = [ {file = "mutagen-1.46.0.tar.gz", hash = "sha256:6e5f8ba84836b99fe60be5fb27f84be4ad919bbb6b49caa6ae81e70584b55e58"}, ] +[[package]] +name = "mwoauth" +version = "0.4.0" +description = "A generic MediaWiki OAuth handshake helper." +optional = false +python-versions = "*" +groups = ["main"] +files = [ + {file = "mwoauth-0.4.0-py3-none-any.whl", hash = "sha256:fed9bc7d6bbabb5f691b918af0ac844e13c9b75d5fa51a898f36d54d798b5fe1"}, + {file = "mwoauth-0.4.0.tar.gz", hash = "sha256:22e3403e748e70146f8eccc1430fe542c9f9c4ff677eff424a52e644f6d8f7c5"}, +] + +[package.dependencies] +oauthlib = "*" +PyJWT = ">=1.0.1" +requests = "*" +requests-oauthlib = "*" + +[package.extras] +flask = ["flask"] + [[package]] name = "mypy-extensions" version = "1.0.0" @@ -3156,6 +3189,24 @@ files = [ [package.extras] windows-terminal = ["colorama (>=0.4.6)"] +[[package]] +name = "pyjwt" +version = "2.10.1" +description = "JSON Web Token implementation in Python" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "PyJWT-2.10.1-py3-none-any.whl", hash = "sha256:dcdd193e30abefd5debf142f9adfcdd2b58004e644f25406ffaebd50bd98dacb"}, + {file = "pyjwt-2.10.1.tar.gz", hash = "sha256:3cc5772eb20009233caf06e9d8a0577824723b44e6648ee0a2aedb6cf9381953"}, +] + +[package.extras] +crypto = ["cryptography (>=3.4.0)"] +dev = ["coverage[toml] (==5.0.4)", "cryptography (>=3.4.0)", "pre-commit", "pytest (>=6.0.0,<7.0.0)", "sphinx", "sphinx-rtd-theme", "zope.interface"] +docs = ["sphinx", "sphinx-rtd-theme", "zope.interface"] +tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"] + [[package]] name = "pyld" version = "2.0.4" @@ -3746,6 +3797,25 @@ requests = ">=2.22,<3" [package.extras] fixture = ["fixtures"] +[[package]] +name = "requests-oauthlib" +version = "2.0.0" +description = "OAuthlib authentication support for Requests." 
+optional = false +python-versions = ">=3.4" +groups = ["main"] +files = [ + {file = "requests-oauthlib-2.0.0.tar.gz", hash = "sha256:b3dffaebd884d8cd778494369603a9e7b58d29111bf6b41bdc2dcd87203af4e9"}, + {file = "requests_oauthlib-2.0.0-py2.py3-none-any.whl", hash = "sha256:7dd8a5c40426b779b0868c404bdef9768deccf22749cde15852df527e6269b36"}, +] + +[package.dependencies] +oauthlib = ">=3.0.0" +requests = ">=2.0.0" + +[package.extras] +rsa = ["oauthlib[signedtoken] (>=3.0.0)"] + [[package]] name = "rpds-py" version = "0.22.3" @@ -4916,6 +4986,26 @@ files = [ {file = "websockets-14.1.tar.gz", hash = "sha256:398b10c77d471c0aab20a845e7a60076b6390bfdaac7a6d2edb0d2c59d75e8d8"}, ] +[[package]] +name = "wikibaseintegrator" +version = "0.12.12" +description = "Python package for reading from and writing to a Wikibase instance" +optional = false +python-versions = "<4.0,>=3.9" +groups = ["main"] +files = [ + {file = "wikibaseintegrator-0.12.12-py3-none-any.whl", hash = "sha256:d39530f994a81ea6baf1c40e069940d8acd61863351bea7907e3dee89ef06ed2"}, + {file = "wikibaseintegrator-0.12.12.tar.gz", hash = "sha256:ce765d7b8ff0f80ddf6f742319f5a68a07701aa69f607580fc65e5acace661b7"}, +] + +[package.dependencies] +backoff = ">=2.2.1,<3.0.0" +mwoauth = ">=0.4.0,<0.5.0" +oauthlib = ">=3.2.2,<4.0.0" +requests = ">=2.32.3,<3.0.0" +requests-oauthlib = ">=2.0.0,<3.0.0" +ujson = ">=5.10.0,<6.0.0" + [[package]] name = "yarl" version = "1.18.3" @@ -5074,4 +5164,4 @@ typesense = ["typesense"] [metadata] lock-version = "2.1" python-versions = "^3.10,<3.14" -content-hash = "5338d2d4fed2085b1c581613a567a79d3b96c602ea0dbc6ef9a882dd0efac036" +content-hash = "88b52c951cc1b39bd6785f8b02bf44bf49a420f7ba90e2958967eec364735962" diff --git a/api/pyproject.toml b/api/pyproject.toml index a7f67fd6a..e8b77dbb4 100644 --- a/api/pyproject.toml +++ b/api/pyproject.toml @@ -90,6 +90,7 @@ troi = "==2025.1.10.0" lb-matching-tools = "==2024.1.30.1" unidecode = "==1.3.8" pycountry = "24.6.1" +WikibaseIntegrator = 
"==0.12.12" # Typesense typesense = { version = "==0.21.0", optional = true } diff --git a/api/tests/music/test_wikidata.py b/api/tests/music/test_wikidata.py new file mode 100644 index 000000000..6990ab29d --- /dev/null +++ b/api/tests/music/test_wikidata.py @@ -0,0 +1,44 @@ +from unittest.mock import patch + +from funkwhale_api.music import wikidata + + +# to test agaitnst the actual database disable r_mock autouse in conftest.py +def test_get_far_right_artists(): + mock_item_data = { + "labels": {"en": {"value": "Mock Artist"}}, + "claims": { + "P434": [ + { + "mainsnak": { + "datavalue": {"value": "e29eb6ec-7773-41cb-8a72-a575d647e6ab"} + } + } + ] + }, + } + + with patch( + "wikibaseintegrator.wbi_helpers.execute_sparql_query" + ) as mock_query, patch( + "wikibaseintegrator.entities.item.ItemEntity.get" + ) as mock_item_get: + # Mock SPARQL query result + mock_query.return_value = { + "results": { + "bindings": [ + {"itemLabel": {"value": "Q123456"}} # Fake Wikidata item ID + ] + } + } + + # Mock the item.get().get_json() call + mock_item_get.return_value.get_json.return_value = mock_item_data + + # Call the actual function + artists = wikidata.get_far_right_artists() + + assert len(artists) == 1 + assert artists[0].name == "Mock Artist" + assert artists[0].mbid == "e29eb6ec-7773-41cb-8a72-a575d647e6ab" + assert artists[0].far_right == "Q123456" diff --git a/docs/specs/far-right-filter/index.md b/docs/specs/far-right-filter/index.md index 88cba019a..44ab1ad7e 100644 --- a/docs/specs/far-right-filter/index.md +++ b/docs/specs/far-right-filter/index.md @@ -12,17 +12,18 @@ Hard code a filter against far right artists preventing far right movement to us ## Feature behavior -To find a common consensus/definition of far right ideology we will use wikidata. Moderation and debates can happen on their infrastructure. 
To be transparent about why an artist in being censored, the backend should display the reference of the wikidata object being used to classify the artist has a far right defender. +To find a common consensus/definition of far right ideology we will use wikidata. Moderation and debates can happen on their infrastructure. To be transparent about why an artist is being censored, the backend should display the reference of the wikidata object being used to classify the artist as a far right defender. This way users can go to the related wikidata page to check the reference if needed. ### Backend behavior ### Backend -- [ ] a cli tool to display the list of right wing artists that display the name, the mbid and the wikidata ref. Prompt a warning if mbid is missing so admins can add a mbid. -- [ ] A new database table to save the list of artists OR a new artist attribute `right_wing_extremism` displaying the wikidata id ? since we don't want to bother with moderation, we can only add an attribute. -- [ ] Display an explicit api error response that explain why the artist in banned (link the feature documentation and the artist wikidata id : - - [ ] on the import process : display an explicit error during import. - - [ ] on the federation artist serializers : +- [x] a celery task that adds far_right artists to the db, or updates existing artists with the far_right attribute if the name and mbid match. Launched every month. +- [ ] a cli tool to display the list of right wing artists that display the name, the mbid and the wikidata ref. 
Prompt a warning if mbid is missing so admins can add a mbid +- [x] A new artist attribute `far_right` displaying the wikidata id +- [ ] Display an explicit api error response that explains why the artist is banned (link the funkwhale code of conduct and the artist wikidata id): + - [x] on the import process : display an explicit error during import + - [ ] on the federation artist serializers workflow : querying wikidata -> create or update artist entries with the new `far_right` attribute -> filter out the artist based on the attribute and display logging info explaining why @@ -56,6 +57,11 @@ SELECT DISTINCT ?item ?itemLabel WHERE { } ``` +#### Wikidata requirements + +- To get the MusicBrainz id of the artist we use : https://www.wikidata.org/wiki/Property:P434 +- To get the artist name we use the English label + #### Import get_or_create_artists_credits_from_musicbrainz