# sound_annalysis/cover_song_stream.py

## tutorial from: https://mtg.github.io/essentia-labs/news/2019/09/05/cover-song-similarity/

#################
# standard part #
#################


import essentia.standard as estd
from essentia.pytools.spectral import hpcpgram

# Candidate recordings. Judging by the names: the original track, a cover
# version of it, and a different song altogether.
yesterday_original = 'audio/Yesterday (Remastered 2009).mp3'
yesterday_cover_01 = 'audio/Yesterday - The Beatles - Connie Talbot (Cover).mp3'
wrong_song = 'audio/Bella Poarch - Build a Btch (Official Music Video).mp3'

# The recording whose chroma features serve as the reference for comparison.
song_reference = yesterday_original

# Load the reference recording with the standard-mode MonoLoader at 32 kHz.
reference_loader = estd.MonoLoader(filename=song_reference, sampleRate=32000)
original_song = reference_loader()

# Harmonic Pitch Class Profile (HPCP) chroma features of the reference signal.
# The sample rate must match the one given to the loader above.
true_cover_hpcp = hpcpgram(original_song, sampleRate=32000)


##################
# Streaming part #
##################

import essentia.streaming as estr
from essentia import array, run, Pool

# The recording that is streamed through the network as the query.
query_filename = wrong_song

# Sample rate handed to every streaming algorithm that takes one; it is the
# same 32 kHz value used for the reference features in the standard part.
query_sample_rate = 32000

# Instantiate the essentia streaming algorithms needed to extract HPCP
# features from the query audio.
audio = estr.MonoLoader(filename=query_filename, sampleRate=query_sample_rate)
frame_cutter = estr.FrameCutter(frameSize=4096, hopSize=2048)
windowing = estr.Windowing(type="blackmanharris62")
spectrum = estr.Spectrum()
peak = estr.SpectralPeaks(sampleRate=query_sample_rate)
whitening = estr.SpectralWhitening(
    maxFrequency=3500,
    sampleRate=query_sample_rate,
)
hpcp = estr.HPCP(
    sampleRate=query_sample_rate,
    minFrequency=100,
    maxFrequency=3500,
    size=12,
)

# Streaming ChromaCrossSimilarity instance.
# `referenceFeature` receives the pre-computed chroma features of the
# reference song (here the HPCP stored in `true_cover_hpcp`).
# `oti` transposes the pitch of the reference HPCP features to a given
# OTI [5] when it is known beforehand; 0 is used here.
sim_matrix = estr.ChromaCrossSimilarity(
    referenceFeature=true_cover_hpcp,
    oti=0,
)

# Cover song similarity alignment algorithm. Per the tutorial,
# `pipeDistance=True` writes the distance value to stdout for each input
# stream.
alignment = estr.CoverSongSimilarity(pipeDistance=True)

# essentia Pool (a Python-dict-like object) used to aggregate the outputs.
pool = Pool()

# Connect all the required algorithms into an essentia streaming network,
# i.e. wire each algorithm's outputs to the inputs of the next one
# in the required workflow and order.

# Audio samples -> frames -> windowed frames -> spectrum.
audio.audio >> frame_cutter.signal
frame_cutter.frame >> windowing.frame
windowing.frame >> spectrum.frame
# The spectrum feeds both the peak detector and the whitening stage.
spectrum.spectrum >> peak.spectrum
spectrum.spectrum >> whitening.spectrum
# Whitening also consumes the detected peaks. Note that `whitening.magnitudes`
# appears as a sink here and as a source a few lines below.
peak.magnitudes >> whitening.magnitudes
peak.frequencies >> whitening.frequencies
# HPCP is computed from the peak frequencies and the whitened magnitudes.
peak.frequencies >> hpcp.frequencies 
whitening.magnitudes >> hpcp.magnitudes
# Query HPCP -> cross-similarity matrix -> cover song alignment.
hpcp.hpcp >> sim_matrix.queryFeature
sim_matrix.csm >> alignment.inputArray
# Collect both alignment outputs in the pool under the given keys.
alignment.scoreMatrix >> (pool, 'scoreMatrix')
alignment.distance >> (pool, 'distance')

# Execute the streaming network, starting from the audio loader.
run(audio)
# Per the tutorial, this prints the cover song similarity distance to stdout
# for every input stream in real time. The Smith-Waterman alignment score
# matrix and the distance are also accumulated in the essentia pool
# (similar to a Python dict) and can be read once the stream has ended.

# Report the last distance value stored in the pool, i.e. the one computed
# for the final input stream.
final_distance = pool['distance'][-1]
print(final_distance)