diff --git a/python-server/filesystem.py b/python-server/filesystem.py index 0ca7ac0..1c9d512 100644 --- a/python-server/filesystem.py +++ b/python-server/filesystem.py @@ -5,6 +5,8 @@ from pydub import AudioSegment def save_audio(filename, base64_string): decoded = None orig_filename = filename[:-4]+"_orig" + with open("latestBase64","wb") as f: + f.write(base64_string) try: decoded = base64.b64decode(base64_string) except TypeError: @@ -16,9 +18,10 @@ def save_audio(filename, base64_string): AudioSegment.from_file(orig_filename).export(filename,format="wav") return b"SUCCESS" -def save_audio_chain(filenames, base64_strings): - compleAudio = None - for fname in filenames: +def save_audio_chain(file_str_tupels): + completeAudio = None + for fname, base64_string in file_str_tupels: + print("Filename: {}".format(fname)) decoded = None orig_filename = fname[:-4]+"_orig" try: @@ -27,14 +30,32 @@ def save_audio_chain(filenames, base64_strings): return b"ERROR_INVALID_ENCODING_64" with open(orig_filename,"wb") as f: f.write(decoded) - if compleAudio == None: + if completeAudio == None: completeAudio = AudioSegment.from_file(orig_filename) else: - completeAudio += [AudioSegment.from_file(orig_filename)] - completeAudio.export(filenames[0],format="wav") + completeAudio += AudioSegment.from_file(orig_filename) + if not completeAudio: + return b"ERROR_AUDIO_CONCAT_FAILED" + else: + completeAudio.export(file_str_tupels[0][0],format="wav") + return b"SUCCESS" def save_transcript(filename, transcript): if os.path.isfile(filename): pass with open(filename + "_transcript","w") as f: f.write(transcript) + +def get_transcript(filename): + if os.path.isfile(filename): + with open(filename + "_transcript","r") as f: + return f.read() + +def filelist(): + return "" + +def fileinfo(filename): + return "" + +def copy_to_output(filename): + return "" diff --git a/python-server/server_interface.py b/python-server/server_interface.py index ece52e3..1ed7db7 100644 --- 
a/python-server/server_interface.py +++ b/python-server/server_interface.py @@ -5,15 +5,34 @@ MAIN_DIR = b"data/" def parse_request(data): ''' parse request and call correct function ''' - + #return b"DUMMY" + print(data.split(b",")[0]) # echo/test connection # cleared_data = is_data_type(b"ECHOREQUEST,",data) if cleared_data: return cleared_data + + # reply transcript # + cleared_data = is_data_type(b"GET_TRANSCRIPT,",data) + if cleared_data: + filename = data.decode("utf-8") + return filesystem.get_transcript().encode("utf-8") + + # get single file info # + cleared_data = is_data_type(b"GET_FILEINFO,",data) + if cleared_data: + filename = data.decode("utf-8") + return filesystem.fileinfo(filename).encode("utf-8") + + # get single file info # + cleared_data = is_data_type(b"GET_FILEINFO_ALL,",data) + if cleared_data: + return filesystem.filelist().encode("utf-8") # handle audio transmission # cleared_data = is_data_type(b"AUDIO_TRANSMISSION,",data) if cleared_data: + print("Handling audio transmission") filename = None try: filename, base64_string = cleared_data.split(b',') @@ -27,17 +46,20 @@ def parse_request(data): # handle a chain of audiotransmissions # if data.startswith(b"CHAIN_AUDIO_TRANSMISSION"): - files = [] - base64_strings = [] - for el in data.split(b"|"): + file_str_tuples = [] + arr = data.split(b"|") + for el in arr[1:-1]: filename, base64_string = el.split(b',') filename = MAIN_DIR + filename.split(b"/")[-1] + b".wav" filename = filename.decode("utf-8") - files += [filename] - base64_strings += [base64_string] - filesystem.save_audio_chain(files,base64_string); - speech.async_create_transcript(files[0]) - return b"SUCCESS" + file_str_tuples += [(filename,base64_string)] + + if len(file_str_tuples) < 2: # a chain has 2 or more elements + return bytes("ERROR_INVALID_NUMBER_FILES_{}".format(len(file_str_tuples)),"utf-8") + + ret = filesystem.save_audio_chain(file_str_tuples); + speech.async_create_transcript(file_str_tuples[0][0]) + return ret # 
other shit @@ -80,3 +102,4 @@ def recive_transcribe_request(audiofile): def android_unittest_transcribe_request(audiofile): ''' the android unittests append a special keyword, requests are dummy handled ''' + pass diff --git a/python-server/speech.py b/python-server/speech.py index 387bfce..397e7d6 100644 --- a/python-server/speech.py +++ b/python-server/speech.py @@ -4,7 +4,11 @@ import os.path import filesystem import log +USE_FREE=False +USE_PAID=True + def async_create_transcript(filename): + print("Creating transcript..") mp.Process(target=create_and_save_transcript,args=(filename,)).start() def create_and_save_transcript(filename): @@ -18,12 +22,21 @@ def analyse(filename): audio = recognizer.record(source) try: - string = recognizer.recognize_google(audio,language="de-DE") + if USE_FREE: + string = free_google_backend(recognizer, audio) + elif USE_PAID: + string = paid_google_backend(recognizer,audio) except spr.UnknownValueError: log.log("Audio file is broken or not an audio file") - return None + return "ERROR_AUDIO_FILE_INVALID" except spr.RequestError as e: log.log("Could not connect to google API: {}".format(e)) - return None + return "ERROR_API_FAILURE" return string + +def free_google_backend(recognizer, audio): + return recognizer.recognize_google(audio,language="de-DE") + +def paid_google_backend(recognizer, audio): + pass diff --git a/python-server/transcribe_async.py b/python-server/transcribe_async.py new file mode 100644 index 0000000..0a35005 --- /dev/null +++ b/python-server/transcribe_async.py @@ -0,0 +1,102 @@ +#!/usr/bin/env python + +# Copyright 2017 Google Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Google Cloud Speech API sample application using the REST API for async
batch processing.

Example usage:
    python transcribe_async.py resources/audio.raw
    python transcribe_async.py gs://cloud-samples-tests/speech/vr.flac
"""

import argparse
import io


def _print_results(response):
    """Print the top transcript and its confidence for every result.

    Args:
        response: Object exposing ``results``; each result must expose a
            non-empty ``alternatives`` sequence whose items carry
            ``transcript`` and ``confidence`` attributes.
    """
    # Each result is for a consecutive portion of the audio. Iterate through
    # them to get the transcripts for the entire audio file.
    for result in response.results:
        # The first alternative is the most likely one for this portion.
        best = result.alternatives[0]
        print(u'Transcript: {}'.format(best.transcript))
        print('Confidence: {}'.format(best.confidence))


# [START speech_transcribe_async]
def transcribe_file(speech_file, language_code='en-US', timeout=90):
    """Transcribe the given audio file asynchronously.

    Reads ``speech_file`` as bytes, submits it with
    ``client.long_running_recognize`` using a LINEAR16 / 16000 Hz config,
    waits for the operation and prints the transcript and confidence of
    each result.

    Args:
        speech_file: Path of the local audio file to read.
        language_code: Language tag placed in the recognition config.
            Defaults to 'en-US', the value that used to be hard-coded.
        timeout: Seconds handed to ``operation.result``. Defaults to 90,
            the value that used to be hard-coded.

    Returns:
        None. The results are printed.
    """
    # Imported lazily so this module can be imported without the Google
    # client library being installed.
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    client = speech.SpeechClient()

    # [START speech_python_migration_async_request]
    with io.open(speech_file, 'rb') as audio_file:
        content = audio_file.read()

    audio = types.RecognitionAudio(content=content)
    config = types.RecognitionConfig(
        encoding=enums.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=16000,
        language_code=language_code)

    # [START speech_python_migration_async_response]
    operation = client.long_running_recognize(config, audio)
    # [END speech_python_migration_async_request]

    print('Waiting for operation to complete...')
    response = operation.result(timeout=timeout)

    _print_results(response)
    # [END speech_python_migration_async_response]
# [END speech_transcribe_async]


# [START speech_transcribe_async_gcs]
def transcribe_gcs(gcs_uri, language_code='de-DE', timeout=90):
    """Asynchronously transcribes the audio file specified by the gcs_uri.

    Submits ``gcs_uri`` with ``client.long_running_recognize``, waits for
    the operation and prints the transcript and confidence of each result.

    Args:
        gcs_uri: URI of the audio object, passed as
            ``RecognitionAudio(uri=...)``.
        language_code: Language tag placed in the recognition config.
            Defaults to 'de-DE', the value that used to be hard-coded.
        timeout: Seconds handed to ``operation.result``. Defaults to 90,
            the value that used to be hard-coded.

    Returns:
        None. The results are printed.
    """
    # Imported lazily so this module can be imported without the Google
    # client library being installed.
    from google.cloud import speech
    from google.cloud.speech import enums
    from google.cloud.speech import types
    client = speech.SpeechClient()

    audio = types.RecognitionAudio(uri=gcs_uri)
    # encoding and sample_rate_hertz are deliberately not set here (they
    # were commented out in the previous revision).
    # NOTE(review): presumably the service derives them from the file
    # header -- confirm for the audio formats that get uploaded.
    config = types.RecognitionConfig(language_code=language_code)

    operation = client.long_running_recognize(config, audio)

    print('Waiting for operation to complete...')
    response = operation.result(timeout=timeout)

    _print_results(response)
# [END speech_transcribe_async_gcs]


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument(
        'path', help='File or GCS path for audio file to be recognized')
    args = parser.parse_args()
    # Paths with the gs:// scheme go to the GCS variant; anything else is
    # read as a local file.
    if args.path.startswith('gs://'):
        transcribe_gcs(args.path)
    else:
        transcribe_file(args.path)