wikidata tasks to add artists to db

This commit is contained in:
Petitminion 2025-02-09 21:19:00 +01:00
parent 07c8a0c24c
commit 4c9cbc3413
8 changed files with 276 additions and 18 deletions

View File

@ -983,6 +983,11 @@ CELERY_BEAT_SCHEDULE = {
"schedule": crontab(day_of_month="2", minute="30", hour="3"),
"options": {"expires": 60 * 60 * 24},
},
"music.wikidata_far_righ_artists": {
"task": "music.wikidata_far_righ_artists",
"schedule": crontab(day_of_month="3", minute="30", hour="3"),
"options": {"expires": 60 * 60 * 24},
},
}
if env.str("TYPESENSE_API_KEY", default=None):

View File

@ -0,0 +1,17 @@
# Generated by Django 5.1.5 on 2025-02-09 16:43
from django.db import migrations, models
class Migration(migrations.Migration):
    """Add the optional ``far_right`` text column to the ``artist`` model."""

    # Must run after the previous migration of the ``music`` app.
    dependencies = [
        ("music", "0062_upload_third_party_provider"),
    ]
    operations = [
        # Nullable/blank so existing artist rows need no value; limited to
        # 100 characters.
        migrations.AddField(
            model_name="artist",
            name="far_right",
            field=models.CharField(blank=True, max_length=100, null=True),
        ),
    ]

View File

@ -20,12 +20,11 @@ from funkwhale_api.federation import library as lb
from funkwhale_api.federation import routes
from funkwhale_api.federation import utils as federation_utils
from funkwhale_api.music.management.commands import import_files
from funkwhale_api.music.models import Artist
from funkwhale_api.tags import models as tags_models
from funkwhale_api.tags import tasks as tags_tasks
from funkwhale_api.taskapp import celery
from . import licenses, metadata, models, signals
from . import licenses, metadata, models, signals, wikidata
logger = logging.getLogger(__name__)
@ -485,15 +484,7 @@ def get_best_candidate_or_create(model, query, defaults, sort_fields):
"""
candidates = model.objects.filter(query)
if candidates:
sorted_candidates = sort_candidates(candidates, sort_fields)
if model == Artist and sorted_candidates[0].far_right:
raise FarRightError(
code="Far right artist detected",
detail=f"The artist name has been matched with this wikidata entity \
{sorted_candidates[0].far_right}. This artist will not be saved. No pasaran. \
You can checkout our coc at https://www.funkwhale.audio/code-of-conduct/",
)
return sorted_candidates[0], False
return sort_candidates(candidates, sort_fields)[0], False
return model.objects.create(**defaults), True
@ -821,9 +812,18 @@ def get_or_create_artist_from_ac(ac_data, attributed_to, from_activity_id):
}
if ac_data.get("fdate"):
defaults["creation_date"] = ac_data.get("fdate")
artist, created = get_best_candidate_or_create(
models.Artist, query, defaults=defaults, sort_fields=["mbid", "fid"]
)
if artist.far_right:
raise FarRightError(
code="Far right artist detected",
detail=f"The artist name has been matched with this wikidata entity \
{artist.far_right}. This artist will not be saved. No pasaran. \
You can checkout our coc at https://www.funkwhale.audio/code-of-conduct/",
)
if created:
tags_models.add_tags(artist, *tags)
common_utils.attach_content(artist, "description", description)
@ -1267,3 +1267,8 @@ def fs_import(
"broadcast": broadcast,
}
command.handle(**options)
@celery.app.task(name="music.wikidata_far_righ_artists")
def wikidata_far_righ_artists():
    """Celery task that runs ``wikidata.get_far_right_artists()``.

    The value returned by the helper is discarded; the task returns ``None``.
    """
    # NOTE(review): the registered task name (including its "righ" spelling)
    # is referenced verbatim by the "music.wikidata_far_righ_artists" entry of
    # CELERY_BEAT_SCHEDULE; rename both together or not at all.
    wikidata.get_far_right_artists()

View File

@ -0,0 +1,90 @@
import logging
import socket
import pytest
from django.db.models import Q
from wikibaseintegrator import WikibaseIntegrator, wbi_helpers
from wikibaseintegrator.wbi_config import config as wbi_config
from funkwhale_api.music.models import Artist
logger = logging.getLogger(__name__)
@pytest.fixture(autouse=True)
def enable_network_calls():
    """Pytest fixture that rebinds ``socket.socket`` to ``socket.create_connection``."""
    # NOTE(review): the assignment is never undone, so the rebinding outlives
    # the test that triggered it.
    # NOTE(review): presumably meant to re-enable network access that the test
    # setup blocks — confirm the intent, and confirm that pytest actually
    # registers a fixture declared in this non-test module.
    socket.socket = socket.create_connection
# SPARQL query sent to Wikidata by get_far_right_artists().
# It selects ?item / ?itemLabel for every item that is an instance (P31) of
# Q215380 and carries a genre (P136) statement whose value is one of the
# genres listed in VALUES, keeping only statements that have at least one
# reference (prov:wasDerivedFrom).
# NOTE(review): Q215380 is presumably the "musical group" class — confirm.
WIKIDATA_QUERY = """SELECT DISTINCT ?item ?itemLabel WHERE {
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE]". }
?item wdt:P31 wd:Q215380.
# Match items with the relevant properties and ensure they have references
{
VALUES ?genre {
wd:Q533914 # NSBM
wd:Q224694 # whit power music
wd:Q113084468 # nazi rock
wd:Q121411631 # neonazi music
wd:Q1547998 # rock identitaire francai
wd:Q602498 # nazi punk
wd:Q3328582 # italian right wing alternative
wd:Q828181 # rock against communism
}
?item p:P136 ?statement.
?statement ps:P136 ?genre.
# Ensure that these statements have references
FILTER EXISTS { ?statement prov:wasDerivedFrom ?reference. }
}
}
"""
def _wikidata_entity_id(binding):
    """Return the Wikidata entity id (``Q…``) carried by one SPARQL result row."""
    # ``?item`` is expected to hold the entity URI, whose last path segment is
    # the id; unlike ``?itemLabel`` it does not depend on how the label service
    # resolved the label.
    item_uri = binding.get("item", {}).get("value")
    if item_uri:
        return item_uri.rsplit("/", 1)[-1]
    # Rows that only expose ``itemLabel`` keep the previous behaviour.
    return binding["itemLabel"]["value"]


def get_far_right_artists():
    """Flag artists returned by ``WIKIDATA_QUERY`` in the local database.

    For every result row the Wikidata entity is fetched. Its English label is
    used as the artist name and the first ``P434`` claim as the MBID. An
    artist matching both name and MBID gets its ``far_right`` attribute set to
    the Wikidata id; when none matches, a new artist is created with that
    attribute.

    Entities lacking an English label or an MBID are logged and skipped, so a
    single incomplete Wikidata entry cannot abort the whole run.

    Returns:
        list: the ``Artist`` instances that were created or updated.
    """
    # Imported here because ``music.tasks`` itself imports this module.
    from funkwhale_api.music.tasks import get_best_candidate_or_create

    wbi_config[
        "USER_AGENT"
    ] = "Funkwhale_far_righ_artist/1.0 (https://docs.funkwhale.audio/specs/far-right-filter/index.html)"
    wbi = WikibaseIntegrator()
    results = wbi_helpers.execute_sparql_query(WIKIDATA_QUERY)

    artists = []
    for result in results["results"]["bindings"]:
        item_id = _wikidata_entity_id(result)
        wkd_artist = wbi.item.get(item_id).get_json()

        english_label = wkd_artist["labels"].get("en")
        if not english_label:
            # Previously a ValueError: it stopped the run and left every
            # following artist unprocessed.
            logger.warning(
                "Artist https://www.wikidata.org/wiki/%s has no English label. "
                "Skipping creation.",
                item_id,
            )
            continue
        artist_name = english_label["value"]

        mbid_claims = wkd_artist["claims"].get("P434")
        if not mbid_claims:
            logger.warning(
                "Artist %s from https://www.wikidata.org/wiki/%s has no MBID. "
                "Skipping creation.",
                artist_name,
                item_id,
            )
            continue
        artist_mbid = mbid_claims[0]["mainsnak"]["datavalue"]["value"]

        query = Q(name=artist_name) & Q(mbid=artist_mbid)
        defaults = {"name": artist_name, "mbid": artist_mbid, "far_right": item_id}
        artist, created = get_best_candidate_or_create(
            Artist, query, defaults, sort_fields=["name"]
        )
        if not created:
            # An existing artist was matched: record the Wikidata reference.
            artist.far_right = item_id
            artist.save()
        artists.append(artist)
    return artists

92
api/poetry.lock generated
View File

@ -339,6 +339,18 @@ files = [
[package.extras]
visualize = ["Twisted (>=16.1.1)", "graphviz (>0.5.1)"]
[[package]]
name = "backoff"
version = "2.2.1"
description = "Function decoration for backoff and retry"
optional = false
python-versions = ">=3.7,<4.0"
groups = ["main"]
files = [
{file = "backoff-2.2.1-py3-none-any.whl", hash = "sha256:63579f9a0628e06278f7e47b7d7d5b6ce20dc65c5e96a6f3ca99a6adca0396e8"},
{file = "backoff-2.2.1.tar.gz", hash = "sha256:03f829f5bb1923180821643f8753b0502c3b682293992485b0eef2807afa5cba"},
]
[[package]]
name = "billiard"
version = "4.2.1"
@ -2516,6 +2528,27 @@ files = [
{file = "mutagen-1.46.0.tar.gz", hash = "sha256:6e5f8ba84836b99fe60be5fb27f84be4ad919bbb6b49caa6ae81e70584b55e58"},
]
[[package]]
name = "mwoauth"
version = "0.4.0"
description = "A generic MediaWiki OAuth handshake helper."
optional = false
python-versions = "*"
groups = ["main"]
files = [
{file = "mwoauth-0.4.0-py3-none-any.whl", hash = "sha256:fed9bc7d6bbabb5f691b918af0ac844e13c9b75d5fa51a898f36d54d798b5fe1"},
{file = "mwoauth-0.4.0.tar.gz", hash = "sha256:22e3403e748e70146f8eccc1430fe542c9f9c4ff677eff424a52e644f6d8f7c5"},
]
[package.dependencies]
oauthlib = "*"
PyJWT = ">=1.0.1"
requests = "*"
requests-oauthlib = "*"
[package.extras]
flask = ["flask"]
[[package]]
name = "mypy-extensions"
version = "1.0.0"
@ -3156,6 +3189,24 @@ files = [
[package.extras]
windows-terminal = ["colorama (>=0.4.6)"]
[[package]]
name = "pyjwt"
version = "2.10.1"
description = "JSON Web Token implementation in Python"
optional = false
python-versions = ">=3.9"
groups = ["main"]
files = [
{file = "PyJWT-2.10.1-py3-none-any.whl", hash = "sha256:dcdd193e30abefd5debf142f9adfcdd2b58004e644f25406ffaebd50bd98dacb"},
{file = "pyjwt-2.10.1.tar.gz", hash = "sha256:3cc5772eb20009233caf06e9d8a0577824723b44e6648ee0a2aedb6cf9381953"},
]
[package.extras]
crypto = ["cryptography (>=3.4.0)"]
dev = ["coverage[toml] (==5.0.4)", "cryptography (>=3.4.0)", "pre-commit", "pytest (>=6.0.0,<7.0.0)", "sphinx", "sphinx-rtd-theme", "zope.interface"]
docs = ["sphinx", "sphinx-rtd-theme", "zope.interface"]
tests = ["coverage[toml] (==5.0.4)", "pytest (>=6.0.0,<7.0.0)"]
[[package]]
name = "pyld"
version = "2.0.4"
@ -3746,6 +3797,25 @@ requests = ">=2.22,<3"
[package.extras]
fixture = ["fixtures"]
[[package]]
name = "requests-oauthlib"
version = "2.0.0"
description = "OAuthlib authentication support for Requests."
optional = false
python-versions = ">=3.4"
groups = ["main"]
files = [
{file = "requests-oauthlib-2.0.0.tar.gz", hash = "sha256:b3dffaebd884d8cd778494369603a9e7b58d29111bf6b41bdc2dcd87203af4e9"},
{file = "requests_oauthlib-2.0.0-py2.py3-none-any.whl", hash = "sha256:7dd8a5c40426b779b0868c404bdef9768deccf22749cde15852df527e6269b36"},
]
[package.dependencies]
oauthlib = ">=3.0.0"
requests = ">=2.0.0"
[package.extras]
rsa = ["oauthlib[signedtoken] (>=3.0.0)"]
[[package]]
name = "rpds-py"
version = "0.22.3"
@ -4916,6 +4986,26 @@ files = [
{file = "websockets-14.1.tar.gz", hash = "sha256:398b10c77d471c0aab20a845e7a60076b6390bfdaac7a6d2edb0d2c59d75e8d8"},
]
[[package]]
name = "wikibaseintegrator"
version = "0.12.12"
description = "Python package for reading from and writing to a Wikibase instance"
optional = false
python-versions = "<4.0,>=3.9"
groups = ["main"]
files = [
{file = "wikibaseintegrator-0.12.12-py3-none-any.whl", hash = "sha256:d39530f994a81ea6baf1c40e069940d8acd61863351bea7907e3dee89ef06ed2"},
{file = "wikibaseintegrator-0.12.12.tar.gz", hash = "sha256:ce765d7b8ff0f80ddf6f742319f5a68a07701aa69f607580fc65e5acace661b7"},
]
[package.dependencies]
backoff = ">=2.2.1,<3.0.0"
mwoauth = ">=0.4.0,<0.5.0"
oauthlib = ">=3.2.2,<4.0.0"
requests = ">=2.32.3,<3.0.0"
requests-oauthlib = ">=2.0.0,<3.0.0"
ujson = ">=5.10.0,<6.0.0"
[[package]]
name = "yarl"
version = "1.18.3"
@ -5074,4 +5164,4 @@ typesense = ["typesense"]
[metadata]
lock-version = "2.1"
python-versions = "^3.10,<3.14"
content-hash = "5338d2d4fed2085b1c581613a567a79d3b96c602ea0dbc6ef9a882dd0efac036"
content-hash = "88b52c951cc1b39bd6785f8b02bf44bf49a420f7ba90e2958967eec364735962"

View File

@ -90,6 +90,7 @@ troi = "==2025.1.10.0"
lb-matching-tools = "==2024.1.30.1"
unidecode = "==1.3.8"
pycountry = "24.6.1"
WikibaseIntegrator = "==0.12.12"
# Typesense
typesense = { version = "==0.21.0", optional = true }

View File

@ -0,0 +1,44 @@
from unittest.mock import patch
from funkwhale_api.music import wikidata
# to test against the actual database disable r_mock autouse in conftest.py
def test_get_far_right_artists():
    """Run the sync against mocked Wikidata answers and check the resulting artist."""
    wikidata_id = "Q123456"  # Fake Wikidata item ID
    artist_name = "Mock Artist"
    artist_mbid = "e29eb6ec-7773-41cb-8a72-a575d647e6ab"

    # Payload returned by the mocked item.get().get_json() call
    entity_json = {
        "labels": {"en": {"value": artist_name}},
        "claims": {"P434": [{"mainsnak": {"datavalue": {"value": artist_mbid}}}]},
    }
    # Payload returned by the mocked SPARQL query
    sparql_json = {"results": {"bindings": [{"itemLabel": {"value": wikidata_id}}]}}

    sparql_patcher = patch("wikibaseintegrator.wbi_helpers.execute_sparql_query")
    entity_patcher = patch("wikibaseintegrator.entities.item.ItemEntity.get")
    with sparql_patcher as mock_query, entity_patcher as mock_item_get:
        mock_query.return_value = sparql_json
        mock_item_get.return_value.get_json.return_value = entity_json

        # Call the actual function
        artists = wikidata.get_far_right_artists()

    assert len(artists) == 1
    flagged = artists[0]
    assert flagged.name == artist_name
    assert flagged.mbid == artist_mbid
    assert flagged.far_right == wikidata_id

View File

@ -12,17 +12,18 @@ Hard code a filter against far right artists preventing far right movement to us
## Feature behavior
To find a common consensus/definition of far right ideology we will use wikidata. Moderation and debates can happen on their infrastructure. To be transparent about why an artist in being censored, the backend should display the reference of the wikidata object being used to classify the artist has a far right defender.
To find a common consensus/definition of far right ideology we will use wikidata. Moderation and debates can happen on their infrastructure. To be transparent about why an artist is being censored, the backend should display the reference of the wikidata object being used to classify the artist as a far right defender. This way users can go to the related wikidata page to check the reference if needed.
### Backend behavior
### Backend
- [ ] a cli tool to display the list of right wing artists that display the name, the mbid and the wikidata ref. Prompt a warning if mbid is missing so admins can add a mbid.
- [ ] A new database table to save the list of artists OR a new artist attribute `right_wing_extremism` displaying the wikidata id ? since we don't want to bother with moderation, we can only add an attribute.
- [ ] Display an explicit api error response that explain why the artist in banned (link the feature documentation and the artist wikidata id :
- [ ] on the import process : display an explicit error during import.
- [ ] on the federation artist serializers :
- [x] a celery task that adds far_right artists to the db, or updates existing artists with the far_right attribute if the name matches. Launched every month.
- [ ] a cli tool to display the list of right wing artists, showing the name, the mbid and the wikidata ref. Prompt a warning if the mbid is missing so admins can add one
- [x] A new artist attribute `far_right` displaying the wikidata id
- [ ] Display an explicit api error response that explains why the artist is banned (link the funkwhale code of conduct and the artist wikidata id):
- [x] on the import process : display an explicit error during import
- [ ] on the federation artist serializers
workflow : querying wikidata -> create or update artist entries with the new `far_right` attribute -> filter out the artist based on the attribute and display logging info explaining why
@ -56,6 +57,11 @@ SELECT DISTINCT ?item ?itemLabel WHERE {
}
```
#### Wikidata requirements
- To get the MusicBrainz id of the artist we use: https://www.wikidata.org/wiki/P434
- To get the artist name we use the English label
#### Import
get_or_create_artists_credits_from_musicbrainz