SSML specification


https://www.w3.org/TR/speech-synthesis11/ 

Supported SSML Tags


<speak>
Supported attributes:
- version: 1.1
- m_bpm: tempo of the music in beats per minute (BPM); default is 120

- m_time_signature: time signature of the music. For example, 4/4 means that a quarter note (1/4) is one full beat and that there are 4 beats in each measure; default is 4/4


<voice>
Supported attribute:
- name: the letter m (male) or f (female) followed by the number of the voice, e.g. m1, f10

<break>
Supported attribute:
- time: decimal number followed by s (seconds), ms (milliseconds) or b (beats), e.g. 0.25s, 250ms, 0.5b


<phoneme>
Supported attributes:
- alphabet: x-sampa (Extended Speech Assessment Methods Phonetic Alphabet)
- ph: the pronunciation written in X-SAMPA; you can use the https://melobytes.com/en/app/text2phonetics app to convert plain text to its X-SAMPA phonetic pronunciation

<prosody>
Supported attributes:
- duration: decimal number followed by s (seconds), ms (milliseconds) or b (beats), e.g. 0.25s, 250ms, 0.5b. With a time signature of 4/4 and BPM=120, one beat (a 1/4 note) lasts 60/120 = 0.5 s, so 0.5b = 0.5 * 0.5 s = 0.25 s (0.5b is therefore a 1/8 note).

- volume: number from 0 to 100 followed by the % symbol ex. 50%
- pitch: either a frequency (number followed by Hz) or a note name with its octave number, e.g. 220Hz, C4, C#4, D4, D#4, E4, F4, F#4, G4, G#4, A4, A#4, B4 etc.

General rule:
A plain-text syllable (one given without a phonetic pronunciation) that ends with the character "-" belongs to the same word as the syllable that follows it, e.g. "hap-" followed by "py".

Example 1:

<!--
  Example 1: plain-text syllables. Each <prosody> gives one syllable a
  duration in seconds and a pitch written as a note name; each <break> is
  the pause before the next syllable. A syllable ending in "-" (hap-,
  birth-, pi-) is joined to the syllable that follows it.
  NOTE(review): B4 right after D4 / C4 is an upward leap in standard octave
  numbering (the octave changes at C); B3 may be what is intended. Confirm
  against the synthesizer before changing those notes.
-->
<speak version="1.1">
        <break time=".652s"/>
        <prosody duration=".194s" pitch="D3">hap-</prosody>
        <break time=".037s"/>
        <prosody duration=".082s" pitch="D3">py</prosody>
        <break time=".013s"/>
        <prosody duration=".313s" pitch="E3">birth-</prosody>
        <break time=".013s"/>
        <prosody duration=".315s" pitch="D3">day</prosody>
        <break time=".012s"/>
        <prosody duration=".319s" pitch="G3">to</prosody>
        <break time=".007s"/>
        <prosody duration=".32s" pitch="F#3">you</prosody>
        <break time=".332s"/>
        <prosody duration=".207s" pitch="D3">hap-</prosody>
        <break time=".008s"/>
        <prosody duration=".101s" pitch="D3">py</prosody>
        <break time=".01s"/>
        <prosody duration=".304s" pitch="E3">birth-</prosody>
        <break time=".022s"/>
        <prosody duration=".304s" pitch="D3">day</prosody>
        <break time=".022s"/>
        <prosody duration=".304s" pitch="A3">to</prosody>
        <break time=".022s"/>
        <prosody duration=".304s" pitch="G3">you</prosody>
        <break time=".348s"/>
        <prosody duration=".202s" pitch="D3">hap-</prosody>
        <break time=".007s"/>
        <prosody duration=".109s" pitch="D3">py</prosody>
        <break time=".008s"/>
        <prosody duration=".307s" pitch="D4">birth-</prosody>
        <break time=".019s"/>
        <prosody duration=".3s" pitch="B4">day</prosody>
        <break time=".026s"/>
        <prosody duration=".312s" pitch="G3">dear</prosody>
        <break time=".014s"/>
        <prosody duration=".308s" pitch="F#3">George</prosody>
        <break time=".018s"/>
        <break time=".304s"/>
        <break time=".022s"/>
        <prosody duration=".182s" pitch="C4">hap-</prosody>
        <break time=".021s"/>
        <prosody duration=".08s" pitch="C4">py</prosody>
        <break time=".043s"/>
        <prosody duration=".287s" pitch="B4">birth-</prosody>
        <break time=".039s"/>
        <prosody duration=".287s" pitch="G3">day</prosody>
        <break time=".039s"/>
        <prosody duration=".285s" pitch="A3">to</prosody>
        <break time=".041s"/>
        <prosody duration=".284s" pitch="G3">you</prosody>
        <break time=".368s"/>
        <prosody duration=".284s" pitch="D3">from</prosody>
        <break time=".042s"/>
        <prosody duration=".31s" pitch="E3">good</prosody>
        <break time=".016s"/>
        <prosody duration=".299s" pitch="D3">friends</prosody>
        <break time=".027s"/>
        <prosody duration=".303s" pitch="G3">and</prosody>
        <break time=".023s"/>
        <prosody duration=".304s" pitch="F#3">true</prosody>
        <break time=".348s"/>
        <prosody duration=".279s" pitch="D3">from</prosody>
        <break time=".047s"/>
        <prosody duration=".29s" pitch="E3">old</prosody>
        <break time=".036s"/>
        <prosody duration=".285s" pitch="D3">friends</prosody>
        <break time=".041s"/>
        <prosody duration=".283s" pitch="A3">and</prosody>
        <break time=".043s"/>
        <prosody duration=".284s" pitch="G3">new</prosody>
        <break time=".368s"/>
        <prosody duration=".182s" pitch="D3">may</prosody>
        <break time=".029s"/>
        <prosody duration=".08s" pitch="D3">good</prosody>
        <break time=".035s"/>
        <prosody duration=".293s" pitch="D4">luck</prosody>
        <break time=".033s"/>
        <prosody duration=".285s" pitch="B4">go</prosody>
        <break time=".041s"/>
        <prosody duration=".284s" pitch="G3">with</prosody>
        <break time=".042s"/>
        <prosody duration=".597s" pitch="F#3">you</prosody>
        <break time=".055s"/>
        <prosody duration=".288s" pitch="C4">and</prosody>
        <break time=".038s"/>
        <prosody duration=".29s" pitch="B4">hap-</prosody>
        <break time=".036s"/>
        <prosody duration=".29s" pitch="G3">pi-</prosody>
        <break time=".036s"/>
        <prosody duration=".289s" pitch="A3">ness</prosody>
        <break time=".037s"/>
        <prosody duration=".61s" pitch="G3">too</prosody>
</speak>


Example 2:

<!--
  Example 2: every syllable is supplied as an X-SAMPA pronunciation in the
  ph attribute of an empty <phoneme> element instead of as plain text. The
  surrounding <prosody> sets that syllable's duration (seconds) and pitch
  (note name); <break> elements are the pauses between syllables.
-->
<speak version="1.1">
  <break time="1.5s"/>
  <prosody duration=".445s" pitch="C3"><phoneme alphabet="x-sampa" ph="'hap"/></prosody>
  <break time=".086s"/>
  <prosody duration=".189s" pitch="C3"><phoneme alphabet="x-sampa" ph="i"/></prosody>
  <break time=".03s"/>
  <prosody duration=".72s" pitch="D3"><phoneme alphabet="x-sampa" ph="'b3:Td"/></prosody>
  <break time=".03s"/>
  <prosody duration=".723s" pitch="C3"><phoneme alphabet="x-sampa" ph="eI"/></prosody>
  <break time=".027s"/>
  <prosody duration=".733s" pitch="F3"><phoneme alphabet="x-sampa" ph="tu:"/></prosody>
  <break time=".017s"/>
  <prosody duration=".736s" pitch="E3"><phoneme alphabet="x-sampa" ph="ju:"/></prosody>
  <break time=".764s"/>
  <prosody duration=".475s" pitch="C3"><phoneme alphabet="x-sampa" ph="'hap"/></prosody>
  <break time=".019s"/>
  <prosody duration=".233s" pitch="C3"><phoneme alphabet="x-sampa" ph="i"/></prosody>
  <break time=".023s"/>
  <prosody duration=".7s" pitch="D3"><phoneme alphabet="x-sampa" ph="'b3:Td"/></prosody>
  <break time=".05s"/>
  <prosody duration=".7s" pitch="C3"><phoneme alphabet="x-sampa" ph="eI"/></prosody>
  <break time=".05s"/>
  <prosody duration=".7s" pitch="G3"><phoneme alphabet="x-sampa" ph="tu:"/></prosody>
  <break time=".05s"/>
  <prosody duration=".7s" pitch="F3"><phoneme alphabet="x-sampa" ph="ju:"/></prosody>
  <break time=".8s"/>
  <prosody duration=".466s" pitch="C3"><phoneme alphabet="x-sampa" ph="'hap"/></prosody>
  <break time=".016s"/>
  <prosody duration=".25s" pitch="C3"><phoneme alphabet="x-sampa" ph="i"/></prosody>
  <break time=".019s"/>
  <prosody duration=".706s" pitch="C4"><phoneme alphabet="x-sampa" ph="'b3:Td"/></prosody>
  <break time=".044s"/>
  <prosody duration=".691s" pitch="A3"><phoneme alphabet="x-sampa" ph="eI"/></prosody>
  <break time=".059s"/>
  <prosody duration=".717s" pitch="F3"><phoneme alphabet="x-sampa" ph="di@3"/></prosody>
  <break time=".033s"/>
  <prosody duration=".708s" pitch="E3"><phoneme alphabet="x-sampa" ph="'skw0S"/></prosody>
  <break time=".042s"/>
  <break time=".7s"/>
  <break time=".05s"/>
  <prosody duration=".419s" pitch="A#3"><phoneme alphabet="x-sampa" ph="'hap"/></prosody>
  <break time=".048s"/>
  <prosody duration=".184s" pitch="A#3"><phoneme alphabet="x-sampa" ph="i"/></prosody>
  <break time=".098s"/>
  <prosody duration=".659s" pitch="A3"><phoneme alphabet="x-sampa" ph="'b3:Td"/></prosody>
  <break time=".091s"/>
  <prosody duration=".661s" pitch="F3"><phoneme alphabet="x-sampa" ph="eI"/></prosody>
  <break time=".089s"/>
  <prosody duration=".656s" pitch="G3"><phoneme alphabet="x-sampa" ph="tu:"/></prosody>
  <break time=".094s"/>
  <prosody duration=".653s" pitch="F3"><phoneme alphabet="x-sampa" ph="ju:"/></prosody>
  <break time=".847s"/>
  <prosody duration=".653s" pitch="C3"><phoneme alphabet="x-sampa" ph="'fr0m"/></prosody>
  <break time=".097s"/>
  <prosody duration=".714s" pitch="D3"><phoneme alphabet="x-sampa" ph="gUd"/></prosody>
  <break time=".036s"/>
  <prosody duration=".688s" pitch="C3"><phoneme alphabet="x-sampa" ph="'frEndz"/></prosody>
  <break time=".062s"/>
  <prosody duration=".697s" pitch="F3"><phoneme alphabet="x-sampa" ph="and"/></prosody>
  <break time=".053s"/>
  <prosody duration=".7s" pitch="E3"><phoneme alphabet="x-sampa" ph="tru:"/></prosody>
  <break time=".8s"/>
  <prosody duration=".642s" pitch="C3"><phoneme alphabet="x-sampa" ph="'fr0m"/></prosody>
  <break time=".108s"/>
  <prosody duration=".667s" pitch="D3"><phoneme alphabet="x-sampa" ph="'oUld"/></prosody>
  <break time=".083s"/>
  <prosody duration=".655s" pitch="C3"><phoneme alphabet="x-sampa" ph="'frEndz"/></prosody>
  <break time=".095s"/>
  <prosody duration=".65s" pitch="G3"><phoneme alphabet="x-sampa" ph="and"/></prosody>
  <break time=".1s"/>
  <prosody duration=".653s" pitch="F3"><phoneme alphabet="x-sampa" ph="'nu:"/></prosody>
  <break time=".847s"/>
  <prosody duration=".419s" pitch="C3"><phoneme alphabet="x-sampa" ph="'meI"/></prosody>
  <break time=".067s"/>
  <prosody duration=".184s" pitch="C3"><phoneme alphabet="x-sampa" ph="gUd"/></prosody>
  <break time=".08s"/>
  <prosody duration=".673s" pitch="C4"><phoneme alphabet="x-sampa" ph="lVk"/></prosody>
  <break time=".077s"/>
  <prosody duration=".656s" pitch="A3"><phoneme alphabet="x-sampa" ph="'goU"/></prosody>
  <break time=".094s"/>
  <prosody duration=".653s" pitch="F3"><phoneme alphabet="x-sampa" ph="wID"/></prosody>
  <break time=".097s"/>
  <prosody duration="1.373s" pitch="E3"><phoneme alphabet="x-sampa" ph="ju:"/></prosody>
  <break time="2.377s"/>
  <prosody duration=".662s" pitch="A#3"><phoneme alphabet="x-sampa" ph="and"/></prosody>
  <break time=".088s"/>
  <prosody duration=".667s" pitch="A3"><phoneme alphabet="x-sampa" ph="'hap"/></prosody>
  <break time=".083s"/>
  <prosody duration=".667s" pitch="F3"><phoneme alphabet="x-sampa" ph="In"/></prosody>
  <break time=".083s"/>
  <prosody duration=".664s" pitch="G3"><phoneme alphabet="x-sampa" ph="@s"/></prosody>
  <break time=".086s"/>
  <prosody duration="1.403s" pitch="F3"><phoneme alphabet="x-sampa" ph="'tu:"/></prosody>
</speak>

Example 3:

<!--
  Example 3: durations and pauses are written in beats (b suffix) instead of
  seconds, with the tempo and metre declared on <speak> (m_bpm="95",
  m_time_signature="3/4"). Each syllable is an X-SAMPA <phoneme>, and the
  volume attribute drops from 100% to 50% for the final phrase.
  NOTE(review): B5 right after D5 / C5 is an upward leap in standard octave
  numbering (the octave changes at C); B4 may be what is intended. Confirm
  against the synthesizer before changing those notes.
-->
<speak version="1.1" m_bpm="95" m_time_signature="3/4">
  <break time="1.992b"/>
  <prosody duration=".604b" volume="100%" pitch="D4"><phoneme alphabet="x-sampa" ph="'hap"/></prosody>
  <break time=".062b"/>
  <prosody duration=".329b" volume="100%" pitch="D4"><phoneme alphabet="x-sampa" ph="i"/></prosody>
  <break time=".004b"/>
  <prosody duration=".904b" volume="100%" pitch="E4"><phoneme alphabet="x-sampa" ph="'b3:Td"/></prosody>
  <break time=".104b"/>
  <prosody duration=".904b" volume="100%" pitch="D4"><phoneme alphabet="x-sampa" ph="eI"/></prosody>
  <break time=".104b"/>
  <prosody duration=".896b" volume="100%" pitch="G4"><phoneme alphabet="x-sampa" ph="tu:"/></prosody>
  <break time=".088b"/>
  <prosody duration="1.829b" volume="100%" pitch="F#4"><phoneme alphabet="x-sampa" ph="ju:"/></prosody>
  <break time=".162b"/>
  <break time=".025b"/>
  <prosody duration=".588b" volume="100%" pitch="D4"><phoneme alphabet="x-sampa" ph="'hap"/></prosody>
  <break time=".079b"/>
  <prosody duration=".288b" volume="100%" pitch="D4"><phoneme alphabet="x-sampa" ph="i"/></prosody>
  <break time=".046b"/>
  <prosody duration=".821b" volume="100%" pitch="E4"><phoneme alphabet="x-sampa" ph="'b3:Td"/></prosody>
  <break time=".162b"/>
  <prosody duration=".904b" volume="100%" pitch="D4"><phoneme alphabet="x-sampa" ph="eI"/></prosody>
  <break time=".096b"/>
  <prosody duration=".904b" volume="100%" pitch="A4"><phoneme alphabet="x-sampa" ph="tu:"/></prosody>
  <break time=".096b"/>
  <prosody duration="1.829b" volume="100%" pitch="G4"><phoneme alphabet="x-sampa" ph="ju:"/></prosody>
  <break time=".162b"/>
  <break time=".025b"/>
  <prosody duration=".588b" volume="100%" pitch="D4"><phoneme alphabet="x-sampa" ph="'hap"/></prosody>
  <break time=".079b"/>
  <prosody duration=".288b" volume="100%" pitch="D4"><phoneme alphabet="x-sampa" ph="i"/></prosody>
  <break time=".046b"/>
  <prosody duration=".896b" volume="100%" pitch="D5"><phoneme alphabet="x-sampa" ph="'b3:Td"/></prosody>
  <break time=".088b"/>
  <prosody duration=".904b" volume="100%" pitch="B5"><phoneme alphabet="x-sampa" ph="eI"/></prosody>
  <break time=".096b"/>
  <prosody duration=".904b" volume="100%" pitch="G4"><phoneme alphabet="x-sampa" ph="di@3"/></prosody>
  <break time=".096b"/>
  <prosody duration=".904b" volume="100%" pitch="F#4"><phoneme alphabet="x-sampa" ph="'dA@r"/></prosody>
  <break time=".096b"/>
  <prosody duration="3.671b" volume="100%" pitch="E4"><phoneme alphabet="x-sampa" ph="Il"/></prosody>
  <break time=".254b"/>
  <break time=".075b"/>
  <prosody duration=".604b" volume="50%" pitch="C5"><phoneme alphabet="x-sampa" ph="'hap"/></prosody>
  <break time=".062b"/>
  <prosody duration=".304b" volume="50%" pitch="C5"><phoneme alphabet="x-sampa" ph="i"/></prosody>
  <break time=".046b"/>
  <prosody duration=".896b" volume="50%" pitch="B5"><phoneme alphabet="x-sampa" ph="'b3:Td"/></prosody>
  <break time=".104b"/>
  <prosody duration=".896b" volume="50%" pitch="G4"><phoneme alphabet="x-sampa" ph="eI"/></prosody>
  <break time=".104b"/>
  <prosody duration=".898b" volume="50%" pitch="A4"><phoneme alphabet="x-sampa" ph="tu:"/></prosody>
  <break time=".102b"/>
  <prosody duration="3.069b" volume="50%" pitch="G4"><phoneme alphabet="x-sampa" ph="ju:"/></prosody>
  <break time=".223b"/>
</speak>