# sound_annalysis/cover_song_stream.py
# (102 lines, 3.4 KiB, Python)

## tutorial from: https://mtg.github.io/essentia-labs/news/2019/09/05/cover-song-similarity/
#################
# standard part #
#################
import essentia.standard as estd
from essentia.pytools.spectral import hpcpgram
# Audio files available for this experiment.
yesterday_original = 'audio/Yesterday (Remastered 2009).mp3'
yesterday_cover_01 = 'audio/Yesterday - The Beatles - Connie Talbot (Cover).mp3'
wrong_song = 'audio/Bella Poarch - Build a Btch (Official Music Video).mp3'

# The reference track: its chroma features are pre-computed here and later
# passed to the streaming similarity algorithm as `referenceFeature`.
song_reference = yesterday_original

# Load the reference track with the standard-mode MonoLoader at 32 kHz.
reference_loader = estd.MonoLoader(filename=song_reference, sampleRate=32000)
original_song = reference_loader()

# Compute the Harmonic Pitch Class Profile (HPCP) chroma features of the
# reference signal, using the same sample rate as the loader.
true_cover_hpcp = hpcpgram(original_song, sampleRate=32000)
##################
# Streaming part #
##################
import essentia.streaming as estr
from essentia import array, run, Pool
# File that is streamed through the network as the query.
query_filename = wrong_song

# Instantiate every essentia streaming algorithm the network needs.
# All sample-rate-aware algorithms are configured with 32 kHz, matching the
# rate the loader is asked to deliver.
audio = estr.MonoLoader(filename=query_filename, sampleRate=32000)
frame_cutter = estr.FrameCutter(frameSize=4096, hopSize=2048)
windowing = estr.Windowing(type="blackmanharris62")
spectrum = estr.Spectrum()
peak = estr.SpectralPeaks(sampleRate=32000)
whitening = estr.SpectralWhitening(maxFrequency=3500, sampleRate=32000)
hpcp = estr.HPCP(
    sampleRate=32000,
    minFrequency=100,
    maxFrequency=3500,
    size=12,
)
# Streaming ChromaCrossSimilarity instance.
#   * `referenceFeature` receives the pre-computed chroma features of the
#     reference song (here: the HPCP stored in `true_cover_hpcp`).
#   * `oti` transposes the pitch of the reference HPCP feature to a given
#     OTI [5], if it is known beforehand; it is set to 0 here.
sim_matrix = estr.ChromaCrossSimilarity(referenceFeature=true_cover_hpcp, oti=0)

# Cover song similarity alignment algorithm.
# `pipeDistance=True` writes the distance value to stdout for each input
# stream.
alignment = estr.CoverSongSimilarity(pipeDistance=True)

# essentia Pool (a Python-dict-like object) that aggregates the outputs.
pool = Pool()
# Wire the algorithms into a single essentia streaming network by connecting
# each output to the input that consumes it, in the required workflow order.
# Loader samples are cut into frames, windowed, and turned into a spectrum.
audio.audio >> frame_cutter.signal
frame_cutter.frame >> windowing.frame
windowing.frame >> spectrum.frame
# The spectrum feeds both the peak detector and the whitening stage.
spectrum.spectrum >> peak.spectrum
spectrum.spectrum >> whitening.spectrum
# Whitening also receives the detected peak magnitudes and frequencies.
peak.magnitudes >> whitening.magnitudes
peak.frequencies >> whitening.frequencies
# HPCP takes the peak frequencies together with the whitening stage's
# magnitudes output.
peak.frequencies >> hpcp.frequencies
whitening.magnitudes >> hpcp.magnitudes
# The query HPCP frames go to the cross-similarity stage, whose output is the
# input of the cover song similarity alignment.
hpcp.hpcp >> sim_matrix.queryFeature
sim_matrix.csm >> alignment.inputArray
# Collect both alignment outputs in the pool under the given keys.
alignment.scoreMatrix >> (pool, 'scoreMatrix')
alignment.distance >> (pool, 'distance')
# Execute the streaming network, starting from the loader.
run(audio)

# While running, the network writes the cover song similarity distance to
# stdout for every input stream in real time. It also aggregates the
# Smith-Waterman alignment score matrix and the similarity distance for every
# accumulating input audio stream in the essentia pool, which can be read
# once the stream has ended.

# Report the final distance, i.e. the value computed at the last input stream.
final_distance = pool['distance'][-1]
print(final_distance)