Monday, July 26, 2021

How to convert local video file to text transcript

The SpeechRecognition package supports many APIs, such as:

recognize_bing()
recognize_google()
recognize_google_cloud()
recognize_houndify()
recognize_ibm()
recognize_sphinx()
recognize_wit()

But the only free one is recognize_google().


#pip3 install SpeechRecognition moviepy
import speech_recognition as sr 
import moviepy.editor as mp


VIDEO_PATH = r"c:/images/intro1.mp4"
WAV_PATH = r"c:/images/converted.wav"

# Step 1: extract the video's audio track into a WAV file.
# Using the clip as a context manager closes it (and the reader behind it)
# even when the export fails; the original never closed the clip.
with mp.VideoFileClip(VIDEO_PATH) as clip:
    # A video without sound has clip.audio set to None; fail with a clear
    # message instead of an AttributeError.
    if clip.audio is None:
        raise SystemExit("{} has no audio track to transcribe".format(VIDEO_PATH))
    clip.audio.write_audiofile(WAV_PATH)

# Step 2: load the extracted audio into memory.
r = sr.Recognizer()
r.energy_threshold = 300  # initial energy level treated as silence

audio = sr.AudioFile(WAV_PATH)

with audio as source:
    # NOTE(review): the calibration reads the first 0.5 s of the stream, so
    # record() presumably starts after that portion -- confirm the video has
    # no speech in its first half second, otherwise drop this call.
    r.adjust_for_ambient_noise(source, duration=0.5)
    audio_file = r.record(source)

# Recognizers offered by SpeechRecognition:
#   recognize_bing(): Microsoft Bing Speech
#   recognize_google(): Google Web Speech API, free, working with audio files
#       under 5 minutes, limited to only 50 requests per day
#   recognize_google_cloud(): Google Cloud Speech - requires installation of
#       the google-cloud-speech package
#   recognize_houndify(): Houndify by SoundHound
#   recognize_ibm(): IBM Speech to Text
#   recognize_sphinx(): CMU Sphinx - requires installing PocketSphinx
#   recognize_wit(): Wit.ai

# Step 3: send the audio to the free Google Web Speech API.
try:
    result = r.recognize_google(audio_file)
except sr.UnknownValueError:
    # Raised when the service could not make out any speech in the audio.
    raise SystemExit("Speech in {} could not be understood".format(WAV_PATH)) from None
except sr.RequestError as err:
    # Raised when the service is unreachable or rejects the request.
    raise SystemExit("Google Web Speech request failed: {}".format(err)) from err
print(result)


If you want to use the commercial Google Cloud API, just download your Google JSON credentials file and use the following code:


#pip3 install SpeechRecognition moviepy
#pip3 install --upgrade google-cloud-speech
import speech_recognition as sr 
import moviepy.editor as mp
import os

def GoogleConnection():
    """Export the credentials file path through the environment.

    Sets the ``GOOGLE_APPLICATION_CREDENTIALS`` environment variable to the
    path of ``xxxxxx.json`` inside the directory that contains this script.
    """
    script_dir = os.path.dirname(__file__)
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.path.join(
        script_dir, 'xxxxxx.json'
    )


VIDEO_PATH = r"c:/images/intro1.mp4"
WAV_PATH = r"c:/images/converted.wav"

# Step 1: extract the video's audio track into a WAV file.
# Using the clip as a context manager closes it (and the reader behind it)
# even when the export fails; the original never closed the clip.
with mp.VideoFileClip(VIDEO_PATH) as clip:
    # A video without sound has clip.audio set to None; fail with a clear
    # message instead of an AttributeError.
    if clip.audio is None:
        raise SystemExit("{} has no audio track to transcribe".format(VIDEO_PATH))
    clip.audio.write_audiofile(WAV_PATH)

# Step 2: load the extracted audio into memory.
r = sr.Recognizer()
r.energy_threshold = 300  # initial energy level treated as silence

audio = sr.AudioFile(WAV_PATH)

with audio as source:
    # NOTE(review): the calibration reads the first 0.5 s of the stream, so
    # record() presumably starts after that portion -- confirm the video has
    # no speech in its first half second, otherwise drop this call.
    r.adjust_for_ambient_noise(source, duration=0.5)
    audio_file = r.record(source)

# Step 3: set GOOGLE_APPLICATION_CREDENTIALS, then call the commercial API.
GoogleConnection()
try:
    result = r.recognize_google_cloud(audio_file)
except sr.UnknownValueError:
    # Raised when the service could not make out any speech in the audio.
    raise SystemExit("Speech in {} could not be understood".format(WAV_PATH)) from None
except sr.RequestError as err:
    # Raised when the service is unreachable or rejects the request
    # (for example because of invalid credentials).
    raise SystemExit("Google Cloud Speech request failed: {}".format(err)) from err
print(result)


  

And if the sound file is already stored in Google Firebase Storage, we can process it using the following code:


# Imports the Google Cloud client library
from google.cloud import speech
import os

def GoogleConnection():
    """Export the credentials file path through the environment.

    Sets the ``GOOGLE_APPLICATION_CREDENTIALS`` environment variable to the
    path of ``xxxx.json`` inside the directory that contains this script.
    """
    script_dir = os.path.dirname(__file__)
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = os.path.join(
        script_dir, 'xxxx.json'
    )

# Set GOOGLE_APPLICATION_CREDENTIALS before the client is created.
GoogleConnection()
client = speech.SpeechClient()

# Cloud Storage location of the audio file to transcribe.
gcs_uri = "gs://xxx.appspot.com/Video/converted.wav"

# Describe the audio: 16-bit linear PCM, two channels, US English.
config = speech.RecognitionConfig(
    language_code="en-US",
    audio_channel_count=2,
    #sample_rate_hertz=44100,
    encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
)
audio = speech.RecognitionAudio(uri=gcs_uri)

# NOTE(review): recognize() is the synchronous call; Google documents a
# length limit for it (about one minute) -- confirm, and consider
# long_running_recognize() for longer recordings.
response = client.recognize(config=config, audio=audio)

# Print the first alternative of every recognized segment.
for result in response.results:
    top_alternative = result.alternatives[0]
    print("Transcript: {}".format(top_alternative.transcript))





No comments: