## tutorial from: https://mtg.github.io/essentia-labs/news/2019/09/05/cover-song-similarity/
#################
# standard part #
#################
import essentia.standard as estd
from essentia.pytools.spectral import hpcpgram
yesterday_original = 'audio/Yesterday (Remastered 2009).mp3'
yesterday_cover_01 = 'audio/Yesterday - The Beatles - Connie Talbot (Cover).mp3'
wrong_song = 'audio/Jacques Brel - Ne Me Quitte Pas.mp3'
song_reference = yesterday_original  # the original song, analysed in standard (non-streaming) mode
song_streaming = wrong_song  # the song loaded in streaming mode and compared against the reference

# Load the reference song as a mono signal at a 32 kHz sample rate
original_song = estd.MonoLoader(filename=song_reference, sampleRate=32000)()

# Now let's compute the Harmonic Pitch Class Profile (HPCP) chroma features of this audio signal.
true_cover_hpcp = hpcpgram(original_song, sampleRate=32000)
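
# A minimal standard-mode sketch (an assumption on my part: it follows the batch
# example in the tutorial linked above, not this file's streaming pipeline, and
# the variable names `cover_song`, `cover_hpcp`, `csm`, `pair_csm` are
# illustrative). It compares the reference HPCP against the cover recording
# declared above and prints the resulting cover song similarity distance.
cover_song = estd.MonoLoader(filename=yesterday_cover_01, sampleRate=32000)()
cover_hpcp = hpcpgram(cover_song, sampleRate=32000)
csm = estd.ChromaCrossSimilarity(frameStackSize=9,
                                 frameStackStride=1,
                                 binarizePercentile=0.095,
                                 oti=True)
pair_csm = csm(cover_hpcp, true_cover_hpcp)
score_matrix, distance = estd.CoverSongSimilarity(disOnset=0.5,
                                                  disExtension=0.5,
                                                  alignmentType='serra09',
                                                  distanceType='asymmetric')(pair_csm)
print('standard-mode distance (reference vs. cover):', distance)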

##################
# Streaming part #
##################
import essentia.streaming as estr
from essentia import array, run, Pool
# Let's instantiate all the required essentia streaming algorithms
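# The chain below loads the query audio at 32 kHz, cuts it into 4096-sample
# frames (hop size 2048), applies a Blackman-Harris window, computes the
# magnitude spectrum and its spectral peaks, whitens the peak magnitudes,
# and folds them into a 12-bin HPCP chroma vector per frame.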
audio = estr.MonoLoader(filename=song_streaming, sampleRate=32000)
frame_cutter = estr.FrameCutter(frameSize=4096, hopSize=2048)
windowing = estr.Windowing(type="blackmanharris62")
spectrum = estr.Spectrum()
peak = estr.SpectralPeaks(sampleRate=32000)
whitening = estr.SpectralWhitening(maxFrequency=3500,
                                   sampleRate=32000)
hpcp = estr.HPCP(sampleRate=32000,
                 minFrequency=100,
                 maxFrequency=3500,
                 size=12)

# Create an instance of the streaming ChromaCrossSimilarity algorithm.
# With the parameter `referenceFeature`, we can pass the pre-computed
# chroma features of the reference song; here we use the pre-computed
# HPCP features stored in `true_cover_hpcp`.
# With the parameter `oti`, we can transpose the reference song's HPCP
# features to a given OTI [5] (if it is known beforehand).
# By default we set `oti=0`.
sim_matrix = estr.ChromaCrossSimilarity(
    referenceFeature=true_cover_hpcp,
    oti=0)
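
# Each incoming query HPCP frame is matched against the stored reference
# features, so the cross-similarity matrix passed downstream grows as more
# of the query audio arrives.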

# Create an instance of the cover song similarity alignment algorithm.
# With `pipeDistance=True`, the distance value for each input frame is written to stdout.
alignment = estr.CoverSongSimilarity(pipeDistance=True)

# essentia Pool instance (a python-dict-like object) to aggregate the outputs
pool = Pool()

# Connect all the required algorithms into an essentia streaming network,
# i.e., connect the inputs and outputs of the algorithms
# in the required workflow and order.
audio.audio >> frame_cutter.signal
frame_cutter.frame >> windowing.frame
windowing.frame >> spectrum.frame
spectrum.spectrum >> peak.spectrum
spectrum.spectrum >> whitening.spectrum
peak.magnitudes >> whitening.magnitudes
peak.frequencies >> whitening.frequencies
peak.frequencies >> hpcp.frequencies
whitening.magnitudes >> hpcp.magnitudes
hpcp.hpcp >> sim_matrix.queryFeature
sim_matrix.csm >> alignment.inputArray
alignment.scoreMatrix >> (pool, 'scoreMatrix')
alignment.distance >> (pool, 'distance')
# Run the algorithm network
run(audio)

# This process writes the cover song similarity distance to stdout
# for every input frame in real time.
# It also aggregates the Smith-Waterman alignment score matrix
# and the cover song similarity distance for the accumulating
# input audio stream in an essentia Pool instance (similar to a python dict),
# which can be accessed after the end of the stream.

# Now, let's check the final cover song similarity distance value
# computed at the last input frame.
print(pool['distance'][-1])
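
# A lower distance means the streamed query is more similar to the reference.
# As a hypothetical follow-up (the threshold below is an assumed value, not
# one given by the tutorial), one could flag a likely cover like this:
DISTANCE_THRESHOLD = 2.0  # assumed cut-off; tune it on your own data
if pool['distance'][-1] <= DISTANCE_THRESHOLD:
    print('likely a cover of the reference song')
else:
    print('probably not a cover of the reference song')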