#!/usr/bin/python

# E-Mac-Speak
# Emacspeak on Mac.
# author: David Tseng
# author: Bart Bunting

import aifc
import math
import mmap
import os
import re
import StringIO
import subprocess
import sys
import tempfile
import threading

from Foundation import *
from PyObjCTools import AppHelper
from AppKit import NSObject
from AppKit import NSSound
from AppKit import NSSpeechSynthesizer
from AppKit import NSURL

# Globals.

# True only when the optional pysox binding could be imported; gates the
# sox-based rendering of text clips.
enableSox = False
try:
    import pysox
    # Plain assignment: the previous "&= True" could never turn the flag on
    # because it started out False.
    enableSox = True
except Exception:
    # pysox is optional; any failure to load it simply disables sox effects.
    enableSox = False

# Mutable, module-wide TTS settings shared by the protocol handlers.
ttsState = {}

# amount to increase pitch of capital letters. This may need tweaking.
capsPitchIncrease = 8

# Debug level
# 0 means no debug output is written 10 is maximum output
debugLevel = 0

# Should debug output also be written to STDOUT?
debugToSTDOUT = 1

# A debug file to place sample tts text sent by emacspeak.
if debugLevel > 0:
    DEBUGFILE = tempfile.NamedTemporaryFile(dir='/tmp', mode='w', prefix='emacspeak.servers.mac_log')
    # Insecure version; used for easier debugging.
    # NOTE(review): this rebinding replaces the temporary file created just
    # above with a fixed, world-predictable path -- confirm it is intended.
    DEBUGFILE = open('/tmp/speech.log', 'w')

# global speech rate
ttsState['speechRate'] = 500
# Multiplier applied to speechRate to obtain the rate used for single
# letters and tts_say.
ttsState['charFactor'] = 1.8
ttsState['sayRate'] = round(ttsState['speechRate'] * ttsState['charFactor'])

# set capitalize to off initially
ttsState['capitalize'] = 0

# Possible levels of punctuation (none, some, all).
ttsState['punctuationLevel'] = 'none'

# used to pronounce punctuation chars
# Maps a punctuation character to the word spoken in its place.
punctuationNamesMap = {
    '$': 'dollar',
    '#': 'pound',
    '-': 'dash',
    '"': 'quote',
    '(': 'leftParen',
    ')': 'rightParen',
    '*': 'star',
    ';': 'semi',
    ':': 'colon',
    '<': 'less than',
    '>': 'greater than',
    '\n': '',
    '\\': 'backslash',
    '/': 'slash',
    '+': 'plus',
    '=': 'equals',
    '\'': 'apostrophe',
    '~': 'tilda',
    '`': 'backquote',
    '!': 'exclamation',
    '@': 'at',
    '_': 'underline',
    '.': 'dot',
    ',': 'comma',
}

# For each punctuation level, the characters that are spoken by name.
# These are only used for membership tests, so every value is a real tuple
# (note the trailing comma on the one-element entry) without repeats.
punctuationMap = {
    'none': ('$',),
    'some': ('$', '#', '-', '"', '(', ')', '*', ';', ':', '<', '>', '\n',
             '\\', '/', '+', '=', '~', '`', '!'),
    'all': ('$', '#', '-', '"', '(', ')', '*', ';', ':', '<', '>', '\n',
            '\\', '/', '+', '=', '\'', '~', '`', '!', '@', '_', '.', ','),
}

# Blacklist these characters for audio clip rendering as they cause crashes.
renderingBlacklist = ['*', '(', ')', '\n', '/']

# Do not filter on some or none punctuation.
literalPunctuation = ["'", ","]

# map voice short names to apple class names.
voiceMap = {
    'alex': 'com.apple.speech.synthesis.voice.Alex',
    'victoria': 'com.apple.speech.synthesis.voice.Victoria',
    'vicki': 'com.apple.speech.synthesis.voice.Vicki',
    'ralf': 'com.apple.speech.synthesis.voice.Ralph',
    'kathy': 'com.apple.speech.synthesis.voice.Kathy',
    'junior': 'com.apple.speech.synthesis.voice.Junior',
    'fred': 'com.apple.speech.synthesis.voice.Fred',
    'tracy': 'com.acapelagroup.AGix.voice.Tracy22k_HQ',
    'daniel': 'com.apple.speech.synthesis.voice.daniel.premium',
    'emily': 'com.apple.speech.synthesis.voice.emily.premium',
    'fiona': 'com.apple.speech.synthesis.voice.fiona.premium',
    'karen': 'com.apple.speech.synthesis.voice.karen.premium',
    'lee': 'com.apple.speech.synthesis.voice.lee.premium',
    'moira': 'com.apple.speech.synthesis.voice.moira.premium',
    'sangeeta': 'com.apple.speech.synthesis.voice.sangeeta.premium',
    'tessa': 'com.apple.speech.synthesis.voice.tessa.premium',
    'tom': 'com.apple.speech.synthesis.voice.tom.premium',
    'bruce': 'com.apple.speech.synthesis.voice.Bruce',
}

# A queue of either text or audio filenames waiting to be dispatched to NSSpeechSynthesizer.
speechQueue = []

# Lock access to specific speech synthesis paths.
speechLock = threading.Lock()


class ServerDelegate(NSObject):
    """
    Helper for consuming callbacks from NSSound and NSSpeechSynthesizer.
    Implements the NSSoundDelegate and NSSpeechSynthesizerDelegate protocols.
    """

    # NSSoundDelegate implementation.
    def sound_didFinishPlaying_(self, textClipPlayer, success):
        # A clip finished: move on to the next queued item.
        processSpeechQueue()

    # NSSpeechSynthesizerDelegate implementation.
    def speechSynthesizer_didFinishSpeaking_(self, synthesizer, success):
        # An utterance finished: move on to the next queued item.
        processSpeechQueue()

    def speechSynthesizer_didEncounterErrorAtIndex_ofString_message_(self, synthesizer, index, text, message):
        # Only the index and the offending string are logged; `message` is
        # not included in the log line.
        writeDebugLog(2, "error encountered in synth: index " + str(index) + " of string '" + text + "'\n")

    def speechSynthesizer_didEncounterSyncMessage_(self, synthesizer, message):
        # Every sync message is answered with a short fixed tone
        # (handleTone arguments "500 30").
        writeDebugLog(2, "sync\n")
        p = ProtocolHandler()
        p.handleTone("500 30")


# Static.
speechSynthesizer = NSSpeechSynthesizer.alloc().init()
speechDelegate = ServerDelegate.alloc().init()
speechSynthesizer.setRate_(ttsState['speechRate'])
speechSynthesizer.setDelegate_(speechDelegate)
textClipPlayer = NSSound.alloc().retain()
textClipPlayerDelegate = ServerDelegate.alloc().init()
textClipPlayer.setDelegate_(textClipPlayerDelegate)

# Create non-empty placeholder files. They are closed (and therefore
# flushed) explicitly before being memory-mapped below, since an empty file
# cannot be mapped.
with open("/tmp/speechOut.aiff", "w") as placeholder:
    placeholder.write("0")
with open("/tmp/soxSpeechOut.aiff", "w") as placeholder:
    placeholder.write("0")

speechOutPathString = '/tmp/speechOut.aiff'
speechOutUrl = NSURL.alloc().initWithString_(speechOutPathString)
speechOutMapFile = mmap.mmap(os.open(speechOutPathString, os.O_RDWR), 0)
soxSpeechOutMapFile = mmap.mmap(os.open('/tmp/soxSpeechOut.aiff', os.O_RDWR), 0)


class FileObserver(NSObject):
    """
    An object which observes stdin.
    This class largely contains basic i/o for reading from stdin using NSNotificationCenter.
    """

    def initWithFileDescriptor_readCallback_errorCallback_(self, fileDescriptor, readCallback, errorCallback):
        """Start background reads on fileDescriptor.

        readCallback(observer, data) is called with each chunk read, as a
        str; errorCallback(observer, error) is called when a read completes
        without data. Either callback may be None. Returns the initialised
        observer.
        """
        self = self.init()
        self.readCallback = readCallback
        self.errorCallback = errorCallback
        self.fileHandle = NSFileHandle.alloc().initWithFileDescriptor_(
            fileDescriptor)
        self.nc = NSNotificationCenter.defaultCenter()
        self.protocolHandler = ProtocolHandler()
        self.nc.addObserver_selector_name_object_(
            self,
            'fileHandleReadCompleted:',
            NSFileHandleReadCompletionNotification,
            self.fileHandle)
        self.fileHandle.readInBackgroundAndNotify()
        return self

    def fileHandleReadCompleted_(self, aNotification):
        """Notification handler: forward the data read, or report the error."""
        ui = aNotification.userInfo()
        newData = ui.objectForKey_(NSFileHandleNotificationDataItem)
        if newData is None:
            if self.errorCallback is not None:
                self.errorCallback(self, ui.objectForKey_(NSFileHandleError))
            self.close()
        else:
            # Re-arm the background read before handing the data over.
            self.fileHandle.readInBackgroundAndNotify()
            if self.readCallback is not None:
                self.readCallback(self, str(newData))

    def close(self):
        """Stop observing, close the file handle and drop the callbacks."""
        self.nc.removeObserver_(self)
        if self.fileHandle is not None:
            self.fileHandle.closeFile()
            self.fileHandle = None
        # break cycles in case these functions are closed over
        # an instance of us
        self.readCallback = None
        self.errorCallback = None

    def __del__(self):
        # Without this, if a notification fires after we are GC'ed
        # then the app will crash because NSNotificationCenter
        # doesn't retain observers. In this example, it doesn't
        # matter, but it's worth pointing out.
        self.close()

# This is conceptually the main entry point of the server.
def gotLine(observer, aLine):
    """Read callback: dispatch the data read from stdin, or stop on EOF.

    observer is the FileObserver that produced the data; an empty aLine
    stops the event loop.
    """
    if aLine:
        writeDebugLog(2, "gotline: " + aLine.rstrip() + "\nend\n")
        observer.protocolHandler.dispatchRawTtsMessage(aLine.rstrip())
    else:
        AppHelper.stopEventLoop()


def gotError(observer, err):
    """Error callback: report err on stdout and stop the event loop."""
    sys.stdout.write("error: %s\n" % (err,))
    AppHelper.stopEventLoop()


def writeDebugLog(level, output):
    """Write output to the debug file when level <= debugLevel.

    The text is also echoed to stdout when debugToSTDOUT is set. Nothing
    happens when debugging is switched off (debugLevel == 0).
    """
    # do nothing if debug is off
    if debugLevel == 0:
        return
    if level <= debugLevel:
        if output:
            DEBUGFILE.write("\n" + output)
            # Write debug messages to STDOUT if requested as well
            if debugToSTDOUT:
                sys.stdout.write(output + "\n\n")
        DEBUGFILE.flush()


# Object that implements the Emacspeak TTS protocol.
class _ProtocolHandler:
    """Parses raw protocol messages and routes them to handler methods."""

    # Enables singleton behavior.
    def __call__(self):
        return self

    def __init__(self):
        # Maps a protocol id to a handler function.
        self.protocolIdToHandlerMap = {
            'd': self.handleDispatch,
            'a': self.handleAuditoryIcon,
            'l': self.handleLetter,
            'c': self.handleQueue,
            'q': self.handleQueue,
            's': self.handleStopSpeaking,
            't': self.handleTone,
            'tts_say': self.handleTtsSay,
            'tts_selftest': self.handleTtsSelftest,
            'tts_sync_state': self.handleTtsSyncState,
            'tts_set_punctuations': self.handleTtsSetPunctuations,
            'tts_set_speech_rate': self.handleTtsSetSpeechRate,
            'tts_set_character_scale': self.handleTtsSetCharacterScale,
        }
        # Used to extract protocol id and args from a raw message.
        # Three alternatives, tried in order: "id {block", "id args", "id".
        # The group names are the ones dispatchRawTtsMessage reads back.
        self.protocolRePattern = (
            r"((?P<blockId>[a-z\-]*) {(?P<blockArg>[\s\S]*))"
            r"|((?P<spaceId>[a-z_]*) (?P<spaceArg>[\s\S]*))"
            r"|(?P<id>[a-z_]*)")
        # Used to detect multiple dispatches during processing of one chunk set.
        self.isProcessing = False

    # Protocol Handlers.
    def handleDispatch(self, args):
        """Start working through the speech queue (once per chunk set)."""
        # sometimes we receive multiple dispatches within the same chunks set.
        if not self.isProcessing:
            processSpeechQueue()
            self.isProcessing = True

    # Play an auditory icon.
    # There may be a better way of doing this
    # Currently we play the icon as soon as we receive it which produces the best results.
    def handleAuditoryIcon(self, args):
        """Play the sound file named by args immediately; no-op when empty."""
        # str() keeps the log line from raising when args is None.
        writeDebugLog(6, "Auditory icon " + str(args) + "\n")
        if args:
            NSSound.alloc().initWithContentsOfFile_byReference_(args, True).play()

    def handleLetter(self, args):
        """Interrupt current speech and speak args as a literal character."""
        output = args.strip(" }")
        prefix = "[[rate " + str(ttsState['sayRate']) + "]] [[char ltrl]] "
        suffix = "[[rate " + str(ttsState['speechRate']) + "]] [[char norm]]"
        writeDebugLog(4, "Letter: " + output + "\n")
        self.handleStopSpeaking(None)
        # Check if this is a capital
        if re.match(r"^[A-Z]+$", output):
            # Raise the pitch for the capital and lower it again afterwards.
            prefix = prefix + " [[pbas +" + str(capsPitchIncrease) + "]]"
            suffix = "[[pbas -" + str(capsPitchIncrease) + "]] " + suffix
        # add prefix and suffix to output
        output = prefix + " " + output + " " + suffix
        writeDebugLog(4, "about to speak letter: " + output + "\n")
        # Send directly to TTS to avoid any line processing.
        speechSynthesizer.startSpeakingString_(output)

    def handleQueue(self, args):
        """Queue args for speaking (split around custom embeds)."""
        preprocessCustomEmbeds(args)

    def handleStopSpeaking(self, args):
        """Stop the synthesizer and the clip player and empty the queue."""
        speechSynthesizer.stopSpeaking()
        textClipPlayer.stop()
        del speechQueue[:]

    def handleTone(self, args):
        """Play a sine tone; args is "<frequency> <length>".

        The length is divided by 1000 and multiplied by the frame rate to
        get the number of frames. A zero frequency or length is logged and
        ignored.
        """
        frequency, length = args.split(' ')
        frequency = int(frequency)
        length = int(length)
        if not frequency or not length:
            writeDebugLog(4, 'Invalid tone args %i, %i' % (frequency, length))
            return
        frameRate = 44100
        nframes = frameRate * length / 1000.
        # frequencyWave is measured in radians per sample
        frequencyWave = 2 * math.pi * frequency / frameRate
        f = StringIO.StringIO()
        e = aifc.open(f, 'w')
        e.setnchannels(1)
        e.setsampwidth(2)
        e.setframerate(frameRate)
        e.setnframes(int(nframes))
        e.writeframesraw(self.buildSineWave(frequencyWave, int(nframes)))
        # The in-memory AIFF is read back before the writer is closed.
        soundData = NSData.alloc().initWithBytes_length_(f.getvalue(), len(f.getvalue()))
        icon = NSSound.alloc().initWithData_(soundData)
        icon.play()
        e.close()

    def handleTtsSay(self, args):
        """Interrupt current speech and speak args at the say rate."""
        self.handleStopSpeaking(None)
        speechSynthesizer.startSpeakingString_('[[rate %i]] %s' % (ttsState['sayRate'], args))

    def handleTtsSelftest(self, args):
        """Queue one test sentence per sample effect, then dispatch."""
        samples = [
            "phaser 0.8 0.74 3.0 0.4 0.5 -t",
            "phaser 0.6 0.66 3.0 0.6 2.0 -t",
            "phaser 0.6 0.66 3.0 0.6 2.0 -t",
            "echos 0.8 0.7 700.0 0.25 900.0 0.3",
            "echo 0.8 0.9 1000.0 0.3 1800.0 0.25",
            "chorus 0.6 0.9 50.0 0.4 0.25 2.0 -t 60.0 0.32 0.4 1.3 -s",
            "chorus 0.6 0.9 50.0 0.4 0.25 2.0 -t 60.0 0.32 0.4 1.3 -s",
            "pan -1",
            "pan -.5",
            "pan .5",
            "pan 1",
            "tremolo 10 90",
            "tremolo 1000 60",
            "reverb 100 50 5 0",
            "reverb 100 100 100 0",
            "reverb 50 30 50 0",
            "chorus 0.5 0.9 50.0 0.4 0.25 2.0 -t 60.0 0.32 0.4 2.3 -t",
        ]
        # Samples are queued from the last one down to the first.
        for i in range(len(samples) - 1, -1, -1):
            self.handleQueue("[{" + samples[i] + "}]" + "[[rate 200]] This is a test at index " + str(i) + "}")
        self.handleDispatch(args)

    def handleTtsSetPunctuations(self, args):
        """Set the punctuation level from args."""
        ttsState['punctuationLevel'] = args.strip()
        writeDebugLog(4, "Setting punctuation level: " + ttsState['punctuationLevel'])

    def handleTtsSetSpeechRate(self, args):
        """Set the speech rate from args and recompute the say rate."""
        ttsState['speechRate'] = int(args.strip())
        ttsState['sayRate'] = round(ttsState['speechRate'] * ttsState['charFactor'])
        writeDebugLog(4, "Setting speech rate: " + str(ttsState['speechRate']) + " char factor: " + str(ttsState['charFactor']) + " say rate: " + str(ttsState['sayRate']) + "\n")

    def handleTtsSetCharacterScale(self, args):
        """Set the character factor from args and recompute the say rate."""
        ttsState['charFactor'] = float(args.strip())
        ttsState['sayRate'] = round(ttsState['speechRate'] * ttsState['charFactor'])
        writeDebugLog(4, "Setting character scale: " + str(ttsState['charFactor']) + " say rate: " + str(ttsState['sayRate']) + "\n")

    def handleTtsSyncState(self, args):
        """Set several state values at once.

        args holds five whitespace-separated fields: punctuation level,
        capitalize, allCaps, splitCaps and speech rate.
        """
        params = args.split()
        ttsState['punctuationLevel'] = params[0]
        ttsState['capitalize'] = int(params[1])
        ttsState['allCaps'] = int(params[2])
        ttsState['splitCaps'] = int(params[3])
        ttsState['speechRate'] = int(params[4])
        writeDebugLog(6, "tts_sync: set punctuation level to: " + ttsState['punctuationLevel'] + " capitalize to " + str(ttsState['capitalize']) + " allcaps " + str(ttsState['allCaps']) + " splitCaps " + str(ttsState['splitCaps']) + " rate to: " + str(ttsState['speechRate']) + "\n")

    def dispatchRawTtsMessage(self, message):
        """Split message into lines and route each one to its handler."""
        self.isProcessing = False
        writeDebugLog(4, "dispatchRawTtsMessage")
        chunks = message.split('\n')
        for chunk in chunks:
            writeDebugLog(4, "\nchunk: " + chunk + "\nend\n")
            if not chunk:
                continue
            # Extract protocol id and protocol args.
            matcher = re.match(self.protocolRePattern, chunk)
            # Message validation.
            if not matcher or (len(matcher.groups()) != 7):
                writeDebugLog(4, "Unable to parse pattern " + chunk)
                continue
            # Note the presence of three types of protocol messages:
            # 1. containing only id.
            # 2. containing id and {...} block.
            # 3. containing id and space delimited args.
            bareId = matcher.group('id')
            blockId = matcher.group('blockId')
            blockArg = matcher.group('blockArg')
            spaceId = matcher.group('spaceId')
            spaceArg = matcher.group('spaceArg')
            handlers = self.protocolIdToHandlerMap
            # Send off to the handler.
            if bareId and bareId in handlers:
                handlers[bareId](None)
            elif blockId and blockId in handlers:
                handlers[blockId](blockArg)
            elif spaceId and spaceArg and spaceId in handlers:
                handlers[spaceId](spaceArg)
            else:
                sys.stdout.write("unable to parse\n")
                writeDebugLog(
                    4,
                    "Error! unsupported message id:%s, blockId:%s, blockArg:%s, spaceId:%s, spaceArg:%s" % (
                        bareId, blockId, blockArg, spaceId, spaceArg))

    def buildSineWave(self, frequency, length):
        """Return length 16-bit big-endian samples of a sine wave.

        frequency is the phase step per sample in radians; the amplitude
        is 20000.
        """
        sinewave = bytearray()
        for i in range(length):
            val = int(math.sin(i * frequency) * 20000)
            # High byte first; the masks give the two's-complement bytes.
            sinewave.extend([(val >> 8) & 255, val & 255])
        return bytes(sinewave)

    #TODO: support protocol commands.
    #version": #speak tts version
    #tts_pause:
    #if tts_resume:
    #sh: #silence for ms.
    #tts__reset:
    #tts_allcaps_beep flag (beep)


# Static.
ProtocolHandler = _ProtocolHandler()


# This takes a string with embedded voice changes and
# splits it up and queues individual text events with voice changes inbetween.
def preprocessCustomEmbeds(text):
    """Queue text on speechQueue, split around "[{...}]" custom embeds.

    A "[{voice NAME}]" embed queues a voice change. Any other embed is added
    to the effect list, after which the following text is queued as a
    "textclip" carrying those effects. Plain text is queued in pieces of at
    most 1024 characters.
    """
    writeDebugLog(4, "preprocess custom embeds.")
    effects = []
    text = text.strip(' {}')
    # The following uses a car/cdr pattern.
    # Since we don't know if the car is a text or embedded command, we try both.
    while text:
        matcher = re.match(
            r"(?P<first>(^(.|\n)*?))\[\{(?P<customembed>[a-zA-Z0-9 \.-]*?)\}\](?P<rest>(.|\n)*?)$",
            text)
        if not matcher:
            writeDebugLog(8, "no match in string " + text + "\n")
            break
        first = matcher.group('first')
        embed = matcher.group('customembed')
        rest = matcher.group('rest')
        if first and len(effects) > 0:
            speechQueue.append(("textclip", (effects, first)))
        elif first:
            while len(first) > 1024:
                speechQueue.append(('text', first[:1024]))
                first = first[1024:]
            speechQueue.append(('text', first))
        writeDebugLog(
            8,
            "first: " + str(first) + "\neffects " + str(effects) + "\nrest " + str(rest))
        if embed:
            embedArgs = embed.split(' ')
            # Special case voice changes.
            if embedArgs[0] == 'voice':
                writeDebugLog(4, 'queueing voice change: ' + str(embedArgs))
                speechQueue.append(('voice', embedArgs[1]))
            else:
                effects.append(embedArgs)
        if rest:
            text = rest
        else:
            text = ""
    if text:
        writeDebugLog(8, "queueing final chunk " + text)
        if len(effects) > 0:
            writeDebugLog(8, "Appending textclip" + str(effects))
            speechQueue.append(("textclip", (effects, text)))
        else:
            writeDebugLog(8, "Appending text")
            while len(text) > 1024:
                speechQueue.append(("text", text[:1024]))
                text = text[1024:]
            speechQueue.append(("text", text))


# Processes speech objects.
# format is:
# { type, value }
def processSpeechQueue():
    """Take the next item off speechQueue and act on it.

    Returns at once while anything is speaking or a clip is playing. Empty
    items and voice changes continue straight on to the following item.
    """
    # hacky way to work around buggy NSSpeechSynthesizerDelegate
    with speechLock:
        busy = NSSpeechSynthesizer.isAnyApplicationSpeaking() or textClipPlayer.isPlaying()
    if busy:
        return
    if len(speechQueue) > 0:
        item = speechQueue.pop(0)
        #Text
        if item[0] == "text":
            #Basic processing. TODO: refactor.
            output = item[1].strip(" {}")
            if output:
                output = Clean(output)
                speechSynthesizer.setRate_(ttsState['speechRate'])
                speechSynthesizer.startSpeakingString_(output)
                writeDebugLog(2, "\nsay: " + item[1] + "\nend\n")
            else:
                processSpeechQueue()
        # Voice changes
        elif item[0] == "voice":
            writeDebugLog(4, "Voice change")
            voice = item[1]
            if voice in voiceMap:
                speechSynthesizer.setVoice_(voiceMap[voice])
                writeDebugLog(6, "set voice to " + voiceMap[voice])
            # NOTE(review): the queue continues whether or not the voice is
            # known -- confirm against the intended nesting.
            processSpeechQueue()
        # Custom embeds.
        elif item[0] == 'textclip':
            # This is totally repetitive with above.
            output = (item[1])[1]
            output = ProcessSpecialCharacters(output)
            output = output.strip(' ')
            if not output:
                processSpeechQueue()
                return
            writeDebugLog(4, "rendering text clip" + str(output))
            speechSynthesizer.setRate_(ttsState['speechRate'])
            if not enableSox:
                speechSynthesizer.startSpeakingString_(output)
            else:
                # Render to a file first; the effects are applied by the
                # 'playclip' item pushed onto the front of the queue.
                if speechSynthesizer.startSpeakingString_toURL_(
                        output, speechOutUrl) and output not in renderingBlacklist:
                    speechQueue.insert(0, (('playclip', (item[1])[0])))
        # Plays text clip (with possibly sox processing).
        elif item[0] == 'playclip':
            effectArgs = (item[1]).pop(0)
            writeDebugLog(4, "about to play clip with effects: " + str(effectArgs))
            # Hack to get us proper stereo clips.
            subprocess.call(['sox', '/tmp/speechOut.aiff', '/tmp/speechOut.wav', 'channels', '2'])
            inStream = pysox.CSoxStream('/tmp/speechOut.wav')
            outStream = pysox.CSoxStream(
                '/tmp/soxSpeechOut.aiff', 'w', inStream.get_signal(), fileType='wav')
            chain = pysox.CEffectsChain(inStream, outStream)
            effectName = effectArgs.pop(0)
            chain.add_effect(pysox.CEffect(effectName, effectArgs))
            chain.flow_effects()
            inStream.close()
            outStream.close()
            textClipPlayer.initWithContentsOfFile_byReference_(
                '/tmp/soxSpeechOut.aiff', False).play()


def Clean(text):
    """Apply punctuation handling and then number formatting to text."""
    text = ProcessSpecialCharacters(text)
    text = ExpandNumbers(text)
    return text


def ProcessSpecialCharacters(wordList):
    """Return wordList with punctuation rewritten for the current level.

    Characters spoken at the current punctuation level are replaced by
    their names; other punctuation (except the literal ones) gets a
    trailing space. Text inside "[[...]]" embedded commands is passed
    through untouched. With capitalize on, "[[sync 3]]" is put before each
    capital letter.
    """
    punctList = punctuationMap[ttsState['punctuationLevel']]
    allPunctuation = punctuationMap['all']
    # used to track if we are currently inside an embedded command.
    inEmbeddedCommand = 0
    pieces = []
    for i, char in enumerate(wordList):
        # Only look back when a previous character exists; at index 0 the
        # old wordList[i - 1] wrapped around to the last character.
        previous = wordList[i - 1] if i > 0 else ''
        # Check if we are entering an embedded command
        if char == '[' and previous == '[':
            inEmbeddedCommand = 1
        # Check if we are leaving an embedded command
        if char == ']' and previous == ']':
            inEmbeddedCommand = 0
        # if we are in an embedded command then just send the char straight through with out processing
        if inEmbeddedCommand:
            pieces.append(char)
            continue
        # Check if we have an expansion for this char
        if char in punctList:
            pieces.append(' ' + punctuationNamesMap[char] + ' ')
            continue
        elif char in allPunctuation and char not in literalPunctuation:
            pieces.append(char + ' ')
            continue
        # if we have a cap letter
        if ttsState['capitalize'] and re.match(r"^[A-Z]+$", char):
            pieces.append("[[sync 3]]")
        pieces.append(char)
    return "".join(pieces)


#* Numbers to words
def ExpandNumbers(text):
    """Return text with each run of digits passed through num2eng.

    Each rewritten number is surrounded by spaces. Processing stops at the
    first number that has a comma directly before or after it; the rest of
    the text is then appended unchanged.
    """
    writeDebugLog(4, "Expand numbers")
    expanded_text = ""
    while text:
        writeDebugLog(4, "expandNumbers: while loop: text " + text + " expanded " + expanded_text)
        matcher = re.match(r"^(?P<first>.*?)(?P<number>[0-9]+)(?P<rest>.*)$", text)
        if not matcher:
            writeDebugLog(8, "no match in string " + text + "\n")
            break
        first = matcher.group('first')
        number = matcher.group('number')
        rest = matcher.group('rest')
        if first.endswith(',') or rest.startswith(','):
            break
        expanded_text += first + " " + num2eng(number) + " "
        writeDebugLog(8, "first: " + first + "\nnumber " + number + "\nrest " + rest)
        text = rest
    if text:
        writeDebugLog(8, "appending final chunk " + text)
        expanded_text += text
    writeDebugLog(4, "ExpandNumbers: returning " + expanded_text)
    return expanded_text


def num2eng(num):
    '''Return the digit string num with thousands separators inserted.

    Strings of six characters or fewer, or that already contain a comma,
    are returned unchanged.
    '''
    # No preprocessing necessary.
    if len(num) <= 6 or "," in num:
        return num
    processed = ""
    i = len(num)
    # Peel off groups of three digits from the right-hand end.
    while i > 3:
        processed = "," + num[i - 3:i] + processed
        i -= 3
    return num[0:i] + processed


#* Main
def main():
    """Announce the server, watch stdin and run the console event loop."""
    speechSynthesizer.startSpeakingString_("E Mac Speak server")
    # Bound to a local so the observer stays referenced while the loop runs.
    observer = FileObserver.alloc().initWithFileDescriptor_readCallback_errorCallback_(
        sys.stdin.fileno(), gotLine, gotError)
    try:
        AppHelper.runConsoleEventLoop()
    except KeyboardInterrupt:
        writeDebugLog(2, "\nKeyboard interrupt")
    except Exception as err:
        writeDebugLog(2, "Server crashed:%s" % err)


if __name__ == '__main__':
    main()

# local variables:
# mode: python
# end: