musicbrainz-5-star-rated.ndjson.gzip
{
"displayedAttributes": [
"title",
"album",
"artist",
"genres",
"released_year",
"track_rating",
"artist_rating",
"last_updated",
"duration",
"status",
"image_url"
],
"searchableAttributes": [
"artist",
"title",
"album"
],
"rankingRules": [
"words",
"typo",
"proximity",
"attribute",
"track_rating:desc",
"sort",
"exactness"
],
"filterableAttributes": [
"genres",
"album",
"artist",
"duration",
"status",
"released_year",
"track_rating",
"artist_rating",
"last_updated_timestamp"
],
"sortableAttributes": [
"duration",
"track_rating",
"artist_rating",
"released_year"
],
"proximityPrecision": "byWord"
}
We noticed that some basic queries were not returning good results, so we propose an alternative dataset with alternative settings. The dataset is named musicbrainz-rated.ndjson.gz and its settings are shown below. Note the new rating field, which is the result of (artist_rating * artist_rating_count) and was generated using this small Rust program.
{
"searchableAttributes": [
"artist",
"title",
"album"
],
"rankingRules": [
"words",
"typo",
"proximity",
"rating:desc",
"attribute",
"sort",
"exactness"
]
}
I optimized the scores even further by adding a 5-star ranking to the documents. The musicbrainz-weighted-rating.ndjson.gzip dataset also has a _vectors field that lets Meilisearch auto-generate the vectors, but only for the top 562k songs. Here are the settings you can use:
{
"filterableAttributes": [
"album",
"artist",
"duration",
"genres",
"rating",
"released_year"
],
"sortableAttributes": [
"rating",
"artist_rating",
"duration",
"released_year",
"track_rating",
"raw_rating"
],
"rankingRules": [
"words",
"typo",
"proximity",
"rating:desc",
"attribute",
"sort",
"exactness"
],
"embedders": {
"OpenAI": {
"source": "openAi",
"model": "text-embedding-3-small",
"apiKey": "sk-fvXXXXXX...",
"dimensions": 1536,
"documentTemplate": "A song titled {{doc.title}} by {{doc.artist}} released in {{doc.released_year}} categorised as {{doc.genres}}"
}
},
"faceting": {
"sortFacetValuesBy": {
"*": "alpha",
"artist": "count"
}
}
}
You can follow the installation instructions at this link (we recommend creating a new Ubuntu machine and installing Postgres 16).
-- Export query for the MusicBrainz track dataset (PostgreSQL).
--
-- Starts from musicbrainz.track and LEFT JOINs the medium, release, recording,
-- artist credit, release tags, release status, release events and the rating
-- tables, so a track is kept even when a joined row is missing.
-- Rows are grouped by the keys listed in GROUP BY; the tag names attached to
-- the release are collapsed into a single comma-separated "genres" string.
SELECT
    t.id AS id,
    t.name AS title,
    r.name AS album,
    -- Cover Art Archive front-cover thumbnail, e.g.
    -- https://coverartarchive.org/release/86c90c7b-837d-48de-98ca-6f9fdf2c48dd/front-250
    -- The literal must be a bare URL prefix: surrounding '<' '>' characters
    -- would end up inside every generated image_url.
    CONCAT('https://coverartarchive.org/release/', r.gid, '/front-250') AS image_url,
    a.name AS artist,
    -- t.length / 1000 rounded to 2 decimals (presumably milliseconds -> seconds; confirm units).
    ROUND(t.length / 1000.0, 2) AS duration,
    -- DISTINCT removes tag names repeated by the other one-to-many joins.
    ARRAY_TO_STRING(ARRAY_AGG(DISTINCT ta.name), ',') AS genres,
    rs.name AS status,
    rm.rating AS track_rating,
    am.rating AS artist_rating,
    am.rating_count AS artist_rating_count,
    re.date_year AS released_year,
    m.last_updated AS last_updated,
    -- Same instant as a Unix epoch number, evaluated in UTC.
    EXTRACT(EPOCH FROM m.last_updated AT TIME ZONE 'utc') AS last_updated_timestamp
    -- Optional extra columns, disabled:
    -- r.id AS release_id
    -- rec.name AS better_artist
FROM
    musicbrainz.track AS t
    LEFT JOIN musicbrainz.medium AS m ON m.id = t.medium
    LEFT JOIN musicbrainz.release AS r ON r.id = m.release
    LEFT JOIN musicbrainz.recording AS rec ON rec.id = t.recording
    LEFT JOIN musicbrainz.artist_credit AS a ON a.id = rec.artist_credit
    LEFT JOIN musicbrainz.release_tag AS rt ON rt.release = r.id
    LEFT JOIN musicbrainz.tag AS ta ON rt.tag = ta.id
    LEFT JOIN musicbrainz.release_status AS rs ON rs.id = r.status
    LEFT JOIN musicbrainz.release_event AS re ON re.release = r.id
    -- NOTE(review): this matches an artist_credit id against an artist_meta id.
    -- Confirm both share the same id space; otherwise the artist should
    -- presumably be resolved through artist_credit_name first.
    LEFT JOIN musicbrainz.artist_meta AS am ON a.id = am.id
    LEFT JOIN musicbrainz.recording_meta AS rm ON t.recording = rm.id
-- Optional filters for spot-checking a single track or release
-- (uncomment WHERE and one predicate):
--WHERE
-- t.id = 4852710
-- t.id = 4693112
-- r.name = 'Vince Staples'
-- r.name = 'DAMN.'
-- r.name = 'IXION: Original Soundtrack'
-- r.name = 'Thriller' AND a.name = 'Michael Jackson'
-- r.name = 'Give Me the Night' AND a.name = 'George Benson'
GROUP BY
    r.id,
    t.id,
    a.id,
    m.id,
    rs.id,
    -- NOTE(review): date_year is a grouping key, so a release with events in
    -- several years presumably yields several rows sharing the same id - confirm
    -- whether one row per track is required.
    re.date_year,
    am.rating,
    am.rating_count,
    rm.rating;
-- Before exporting, grant the role permission to write files on the server with the COPY command:
-- grant pg_write_server_files to musicbrainz;
--
-- Then execute the following command from your favorite SQL client.
-- Note that $QUERY stands for the query above without the final ';'.
-- COPY ($QUERY) to '/tmp/musicbrainz.csv' CSV HEADER FORCE QUOTE *;