|
|
<!DOCTYPE html>
|
|
|
<html lang="en">
|
|
|
<meta charset="UTF-8" />
|
|
|
<title>VibeVoice-Realtime TTS Demo</title>
|
|
|
<style>
|
|
|
:root {
|
|
|
--bg: #f5f7fc;
|
|
|
--surface: #ffffff;
|
|
|
--accent: #5562ff;
|
|
|
--accent-strong: #3f4dff;
|
|
|
--text-primary: #1f2742;
|
|
|
--text-muted: #5d6789;
|
|
|
--border: rgba(85, 98, 255, 0.18);
|
|
|
--shadow: 0 18px 45px rgba(31, 39, 66, 0.08);
|
|
|
}
|
|
|
|
|
|
.helper-text {
|
|
|
font-size: 12px;
|
|
|
color: #8a93b5;
|
|
|
}
|
|
|
|
|
|
* {
|
|
|
box-sizing: border-box;
|
|
|
}
|
|
|
|
|
|
body {
|
|
|
margin: 0;
|
|
|
background: var(--bg);
|
|
|
font-family: 'Inter', 'Segoe UI', Roboto, Helvetica, sans-serif;
|
|
|
color: var(--text-primary);
|
|
|
display: flex;
|
|
|
justify-content: center;
|
|
|
padding: 48px 20px;
|
|
|
}
|
|
|
|
|
|
.app-shell {
|
|
|
width: min(960px, 100%);
|
|
|
background: var(--surface);
|
|
|
border-radius: 20px;
|
|
|
padding: 36px 40px 44px;
|
|
|
box-shadow: var(--shadow);
|
|
|
display: flex;
|
|
|
flex-direction: column;
|
|
|
gap: 28px;
|
|
|
}
|
|
|
|
|
|
h1 {
|
|
|
margin: 0;
|
|
|
text-align: center;
|
|
|
font-size: 30px;
|
|
|
font-weight: 700;
|
|
|
letter-spacing: 0.01em;
|
|
|
}
|
|
|
|
|
|
.panel {
|
|
|
display: flex;
|
|
|
flex-direction: column;
|
|
|
gap: 10px;
|
|
|
}
|
|
|
|
|
|
.field {
|
|
|
display: flex;
|
|
|
flex-direction: column;
|
|
|
gap: 8px;
|
|
|
}
|
|
|
|
|
|
.field-label {
|
|
|
font-weight: 600;
|
|
|
font-size: 15px;
|
|
|
color: var(--text-primary);
|
|
|
}
|
|
|
|
|
|
.text-input {
|
|
|
width: 100%;
|
|
|
min-height: 140px;
|
|
|
max-height: 240px;
|
|
|
border: 1px solid rgba(31, 39, 66, 0.14);
|
|
|
border-radius: 12px;
|
|
|
padding: 14px 16px;
|
|
|
font-size: 15px;
|
|
|
line-height: 1.6;
|
|
|
font-family: inherit;
|
|
|
background: #f9faff;
|
|
|
transition: border-color 0.2s, box-shadow 0.2s;
|
|
|
resize: vertical;
|
|
|
}
|
|
|
|
|
|
.text-input:focus {
|
|
|
outline: none;
|
|
|
border-color: var(--accent);
|
|
|
box-shadow: 0 0 0 3px rgba(85, 98, 255, 0.18);
|
|
|
background: #fff;
|
|
|
}
|
|
|
|
|
|
#streamingPreviewContainer {
|
|
|
border-radius: 14px;
|
|
|
border: 1px solid var(--border);
|
|
|
background: linear-gradient(135deg, #eef2ff 0%, #f7f9ff 100%);
|
|
|
padding: 18px 20px;
|
|
|
box-shadow: inset 0 1px 2px rgba(85, 98, 255, 0.12);
|
|
|
}
|
|
|
|
|
|
#streamingPreviewHeader {
|
|
|
font-weight: 600;
|
|
|
color: var(--text-primary);
|
|
|
display: flex;
|
|
|
align-items: center;
|
|
|
gap: 10px;
|
|
|
font-size: 14px;
|
|
|
margin-bottom: 8px;
|
|
|
}
|
|
|
|
|
|
#streamingPreviewNote {
|
|
|
font-weight: 400;
|
|
|
font-size: 12px;
|
|
|
color: var(--text-muted);
|
|
|
}
|
|
|
|
|
|
#streamingPreview {
|
|
|
min-height: 70px;
|
|
|
padding: 10px 12px;
|
|
|
border-radius: 10px;
|
|
|
background: rgba(255, 255, 255, 0.9);
|
|
|
border: 1px solid rgba(85, 98, 255, 0.25);
|
|
|
font-family: 'Courier New', Courier, monospace;
|
|
|
font-size: 14px;
|
|
|
line-height: 1.5;
|
|
|
color: var(--text-primary);
|
|
|
white-space: pre-wrap;
|
|
|
}
|
|
|
|
|
|
#streamingPreview.streaming-active::after {
|
|
|
content: "";
|
|
|
display: inline-block;
|
|
|
width: 2px;
|
|
|
height: 1.1em;
|
|
|
background: var(--accent);
|
|
|
margin-left: 2px;
|
|
|
animation: previewCaret 0.9s steps(1) infinite;
|
|
|
vertical-align: bottom;
|
|
|
}
|
|
|
|
|
|
@keyframes previewCaret {
|
|
|
0%, 50% {
|
|
|
opacity: 1;
|
|
|
}
|
|
|
51%, 100% {
|
|
|
opacity: 0;
|
|
|
}
|
|
|
}
|
|
|
|
|
|
.control-panel {
|
|
|
display: flex;
|
|
|
flex-direction: column;
|
|
|
gap: 18px;
|
|
|
}
|
|
|
|
|
|
.inline-field {
|
|
|
display: flex;
|
|
|
flex-direction: column;
|
|
|
gap: 6px;
|
|
|
}
|
|
|
|
|
|
.select-control {
|
|
|
width: 220px;
|
|
|
border: 1px solid rgba(31, 39, 66, 0.14);
|
|
|
border-radius: 10px;
|
|
|
padding: 8px 12px;
|
|
|
font-size: 14px;
|
|
|
font-family: inherit;
|
|
|
background: #fbfcff;
|
|
|
color: var(--text-primary);
|
|
|
transition: border-color 0.2s, box-shadow 0.2s;
|
|
|
}
|
|
|
|
|
|
.select-control:focus {
|
|
|
outline: none;
|
|
|
border-color: var(--accent);
|
|
|
box-shadow: 0 0 0 3px rgba(85, 98, 255, 0.18);
|
|
|
background: #fff;
|
|
|
}
|
|
|
|
|
|
.control-row {
|
|
|
display: flex;
|
|
|
align-items: center;
|
|
|
flex-wrap: wrap;
|
|
|
gap: 20px 28px;
|
|
|
}
|
|
|
|
|
|
.range-control {
|
|
|
display: flex;
|
|
|
align-items: center;
|
|
|
gap: 12px;
|
|
|
font-size: 14px;
|
|
|
color: var(--text-primary);
|
|
|
}
|
|
|
|
|
|
.range-control input[type="range"] {
|
|
|
width: 200px;
|
|
|
accent-color: var(--accent);
|
|
|
}
|
|
|
|
|
|
.range-value {
|
|
|
font-weight: 600;
|
|
|
color: var(--text-primary);
|
|
|
min-width: 42px;
|
|
|
text-align: right;
|
|
|
}
|
|
|
|
|
|
#playback {
|
|
|
background: var(--accent);
|
|
|
color: #fff;
|
|
|
border: none;
|
|
|
padding: 10px 24px;
|
|
|
border-radius: 999px;
|
|
|
cursor: pointer;
|
|
|
font-weight: 600;
|
|
|
font-size: 14px;
|
|
|
box-shadow: 0 8px 16px rgba(85, 98, 255, 0.25);
|
|
|
transition: transform 0.15s, box-shadow 0.15s, background 0.15s;
|
|
|
}
|
|
|
|
|
|
#playback:hover {
|
|
|
transform: translateY(-1px);
|
|
|
box-shadow: 0 10px 20px rgba(85, 98, 255, 0.28);
|
|
|
}
|
|
|
|
|
|
#playback:active {
|
|
|
transform: translateY(0);
|
|
|
}
|
|
|
|
|
|
#playback.playing {
|
|
|
background: var(--accent-strong);
|
|
|
}
|
|
|
|
|
|
.secondary-btn {
|
|
|
border: 1px solid rgba(31, 39, 66, 0.18);
|
|
|
background: #f1f3ff;
|
|
|
color: var(--text-primary);
|
|
|
padding: 8px 18px;
|
|
|
border-radius: 999px;
|
|
|
cursor: pointer;
|
|
|
font-size: 13px;
|
|
|
font-weight: 500;
|
|
|
transition: background 0.15s, border-color 0.15s;
|
|
|
}
|
|
|
|
|
|
.secondary-btn:hover {
|
|
|
background: #e6e9ff;
|
|
|
border-color: rgba(31, 39, 66, 0.26);
|
|
|
}
|
|
|
|
|
|
.secondary-btn:disabled {
|
|
|
opacity: 0.55;
|
|
|
cursor: not-allowed;
|
|
|
}
|
|
|
|
|
|
.metrics {
|
|
|
display: flex;
|
|
|
flex-wrap: wrap;
|
|
|
gap: 16px 32px;
|
|
|
font-size: 14px;
|
|
|
color: var(--text-muted);
|
|
|
}
|
|
|
|
|
|
.metrics span {
|
|
|
display: flex;
|
|
|
align-items: baseline;
|
|
|
gap: 6px;
|
|
|
}
|
|
|
|
|
|
.metrics span strong {
|
|
|
color: var(--text-primary);
|
|
|
font-weight: 600;
|
|
|
}
|
|
|
|
|
|
.metric-unit {
|
|
|
color: var(--text-muted);
|
|
|
font-size: 13px;
|
|
|
}
|
|
|
|
|
|
#logOutput {
|
|
|
max-height: 260px;
|
|
|
overflow-y: auto;
|
|
|
background: #f7f9ff;
|
|
|
color: var(--text-primary);
|
|
|
padding: 16px 18px;
|
|
|
border: 1px solid rgba(31, 39, 66, 0.12);
|
|
|
border-radius: 12px;
|
|
|
font-size: 13px;
|
|
|
line-height: 1.6;
|
|
|
box-shadow: inset 0 1px 2px rgba(15, 23, 42, 0.06);
|
|
|
font-family: 'Fira Code', 'Courier New', Courier, monospace;
|
|
|
margin-top: 0px;
|
|
|
}
|
|
|
|
|
|
@media (max-width: 720px) {
|
|
|
.app-shell {
|
|
|
padding: 28px 20px 36px;
|
|
|
gap: 24px;
|
|
|
}
|
|
|
|
|
|
.select-control {
|
|
|
width: 100%;
|
|
|
}
|
|
|
|
|
|
.control-row {
|
|
|
flex-direction: column;
|
|
|
align-items: flex-start;
|
|
|
gap: 16px;
|
|
|
}
|
|
|
|
|
|
#playback {
|
|
|
width: 100%;
|
|
|
text-align: center;
|
|
|
}
|
|
|
}
|
|
|
</style>
|
|
|
<body>
|
|
|
<div class="app-shell">
|
|
|
<h1>VibeVoice-Realtime TTS Demo</h1>
|
|
|
|
|
|
<section class="panel">
|
|
|
<label class="field">
|
|
|
<span class="field-label">Text</span>
|
|
|
<textarea
|
|
|
id="prompt"
|
|
|
class="text-input"
|
|
|
rows="4"
|
|
|
>Enter your text here and click "Start" to instantly hear the VibeVoice-Realtime TTS output audio.</textarea>
|
|
|
</label>
|
|
|
|
|
|
<div id="streamingPreviewContainer">
|
|
|
<div id="streamingPreviewHeader">
|
|
|
<span>Streaming Input Text</span>
|
|
|
</div>
|
|
|
<div id="streamingPreview" aria-live="polite">This area will display the streaming input text in real time.</div>
|
|
|
</div>
|
|
|
</section>
|
|
|
<span class="helper-text">This demo requires the full text to be provided upfront. The model then receives the text via streaming input during synthesis.<br>
|
|
|
For non-punctuation special characters, applying text normalization before processing often yields better results.</span>
|
|
|
|
|
|
<section class="panel control-panel">
|
|
|
<div class="inline-field">
|
|
|
<span class="field-label">Speaker</span>
|
|
|
<select id="voiceSelect" class="select-control">
|
|
|
<option value="">Loading...</option>
|
|
|
</select>
|
|
|
</div>
|
|
|
|
|
|
<div class="control-row">
|
|
|
<label class="range-control">
|
|
|
<span>CFG</span>
|
|
|
<input id="cfgScale" type="range" min="1" max="3" step="0.05" value="1.5" />
|
|
|
<span class="range-value" id="cfgValue">1.5</span>
|
|
|
</label>
|
|
|
<label class="range-control">
|
|
|
<span>Inference Steps</span>
|
|
|
<input id="inferenceSteps" type="range" min="1" max="20" step="1" value="5" />
|
|
|
<span class="range-value" id="stepsValue">5</span>
|
|
|
</label>
|
|
|
<button id="resetControls" type="button" class="secondary-btn">Reset Controls</button>
|
|
|
</div>
|
|
|
|
|
|
<div class="control-row">
|
|
|
<button id="playback">Start</button>
|
|
|
<button id="saveAudio" type="button" class="secondary-btn" disabled>Save</button>
|
|
|
</div>
|
|
|
</section>
|
|
|
|
|
|
<section class="panel">
|
|
|
<div class="metrics">
|
|
|
<span>Model Generated Audio<strong id="modelGenerated">0.00</strong><span class="metric-unit">s</span></span>
|
|
|
<span>Audio Played<strong id="playbackElapsed">0.00</strong><span class="metric-unit">s</span></span>
|
|
|
</div>
|
|
|
</section>
|
|
|
|
|
|
<section class="panel">
|
|
|
<span class="field-label">Runtime Logs</span>
|
|
|
<pre id="logOutput"></pre>
|
|
|
</section>
|
|
|
</div>
|
|
|
|
|
|
|
|
|
<script>
|
|
|
(() => {
|
|
|
// --- Audio pipeline configuration -------------------------------------
// Sample rate requested for the AudioContext and written to the WAV header.
const SAMPLE_RATE = 24_000;
// Buffer size (in frames) passed to createScriptProcessor.
const BUFFER_SIZE = 2048;
// Seconds of audio that must be queued before playback is allowed to start.
const PREBUFFER_SEC = 0.1;

// --- Live playback state ----------------------------------------------
// Web Audio objects and the streaming socket; null while idle.
let audioCtx = null;
let scriptNode = null;
let socket = null;
// Decoded samples waiting to be handed to the audio callback.
let buffer = new Float32Array(0);
// True between a Start click and the matching stop.
let isPlaying = false;
// Becomes true once the pre-buffer threshold has been met.
let hasStartedPlayback = false;
// Consecutive all-silent callbacks seen after the socket closed.
let silentFrameCount = 0;
|
|
|
|
|
|
// Cached references to the page elements this script reads or updates.
const byId = id => document.getElementById(id);

// Text entry and the simulated streaming preview.
const promptInput = byId('prompt');
const streamingPreview = byId('streamingPreview');

// Start/Stop button.
const controlBtn = byId('playback');

// Generation controls (the first two are range sliders, the third a <select>).
const cfgSelect = byId('cfgScale');
const stepsSelect = byId('inferenceSteps');
const voiceSelect = byId('voiceSelect');

// Read-outs next to the sliders.
const cfgValueLabel = byId('cfgValue');
const stepsValueLabel = byId('stepsValue');

// Metrics, log panel and secondary buttons.
const modelGeneratedLabel = byId('modelGenerated');
const playbackElapsedLabel = byId('playbackElapsed');
const logOutput = byId('logOutput');
const resetBtn = byId('resetControls');
const saveBtn = byId('saveAudio');
|
|
|
|
|
|
// --- Playback metrics --------------------------------------------------
// Interval handle that refreshes the "Audio Played" read-out.
let playbackTimer = null;
// Last value shown in the "Audio Played" read-out, in seconds.
let lastPlaybackElapsed = 0;
// Frames handed to the audio output since playback started.
let playbackSamples = 0;
// Latest "Model Generated Audio" value, in seconds.
let modelGeneratedTotal = 0;

// --- One-shot log guards ------------------------------------------------
let firstBrowserChunkLogged = false;
let playbackStartedLogged = false;

// --- Log panel ----------------------------------------------------------
const logEntries = [];
// Monotonic counter used as a tie-breaker when sorting log entries.
let logSequence = 0;

// --- Recording for the Save button --------------------------------------
// Raw ArrayBuffers received from the socket, in arrival order.
let recordedChunks = [];
// Total number of 16-bit samples across recordedChunks.
let recordedSamples = 0;
// True once the stream has finished or playback was stopped.
let recordingComplete = false;
// Object URL created for the most recent download, if any.
let downloadUrl = null;
|
|
|
|
|
|
/**
 * Release the object URL created for the last download, if one exists.
 */
const revokeDownloadUrl = () => {
  if (!downloadUrl) {
    return;
  }
  URL.revokeObjectURL(downloadUrl);
  downloadUrl = null;
};
|
|
|
|
|
|
/**
 * Enable the Save button only when audio has been recorded and the
 * recording has been marked complete.
 */
const updateSaveButtonState = () => {
  if (saveBtn) {
    const canSave = recordingComplete && recordedSamples !== 0;
    saveBtn.disabled = !canSave;
  }
};
|
|
|
|
|
|
/**
 * Discard everything recorded so far, release any pending download URL
 * and refresh the Save button.
 */
const clearRecordedChunks = () => {
  recordingComplete = false;
  recordedSamples = 0;
  recordedChunks = [];

  revokeDownloadUrl();
  updateSaveButtonState();
};
|
|
|
|
|
|
/**
 * Assemble the recorded chunks into a mono, 16-bit PCM WAV file.
 *
 * Chunks are copied byte-for-byte: the socket payload is decoded elsewhere
 * as little-endian int16, which is also the WAV sample layout, so no
 * per-sample conversion is needed. Copying bytes (rather than viewing each
 * chunk as an Int16Array) also avoids a RangeError when a chunk has an odd
 * byte length; the trailing odd byte is dropped, and the copy is clamped so
 * it can never run past the size declared in the header.
 *
 * @returns {Blob|null} An `audio/wav` blob, or null when nothing was recorded.
 */
const createWavBlob = () => {
  if (!recordedSamples) {
    return null;
  }

  const HEADER_BYTES = 44;
  const BYTES_PER_SAMPLE = 2;
  const dataBytes = recordedSamples * BYTES_PER_SAMPLE;

  const wavBuffer = new ArrayBuffer(HEADER_BYTES + dataBytes);
  const view = new DataView(wavBuffer);
  const writeString = (offset, str) => {
    for (let i = 0; i < str.length; i += 1) {
      view.setUint8(offset + i, str.charCodeAt(i));
    }
  };

  // RIFF container header.
  writeString(0, 'RIFF');
  view.setUint32(4, 36 + dataBytes, true);
  writeString(8, 'WAVE');

  // "fmt " sub-chunk: PCM, one channel, 16 bits per sample.
  writeString(12, 'fmt ');
  view.setUint32(16, 16, true);
  view.setUint16(20, 1, true);
  view.setUint16(22, 1, true);
  view.setUint32(24, SAMPLE_RATE, true);
  view.setUint32(28, SAMPLE_RATE * BYTES_PER_SAMPLE, true);
  view.setUint16(32, BYTES_PER_SAMPLE, true);
  view.setUint16(34, 16, true);

  // "data" sub-chunk.
  writeString(36, 'data');
  view.setUint32(40, dataBytes, true);

  const pcmBytes = new Uint8Array(wavBuffer, HEADER_BYTES, dataBytes);
  let byteOffset = 0;
  recordedChunks.forEach(chunk => {
    const wholeSampleBytes = chunk.byteLength - (chunk.byteLength % BYTES_PER_SAMPLE);
    const usable = Math.min(wholeSampleBytes, dataBytes - byteOffset);
    if (usable <= 0) {
      return;
    }
    pcmBytes.set(new Uint8Array(chunk, 0, usable), byteOffset);
    byteOffset += usable;
  });

  return new Blob([wavBuffer], { type: 'audio/wav' });
};
|
|
|
|
|
|
/**
 * Show the current CFG slider value with three decimal places.
 */
const updateCfgDisplay = () => {
  const cfg = Number(cfgSelect.value);
  cfgValueLabel.textContent = cfg.toFixed(3);
};
|
|
|
|
|
|
/**
 * Show the current inference-steps slider value.
 */
const updateStepsDisplay = () => {
  const steps = Number(stepsSelect.value);
  stepsValueLabel.textContent = steps.toString();
};
|
|
|
|
|
|
// Keep each read-out in sync with its slider, and render the initial values.
cfgSelect.addEventListener('input', updateCfgDisplay);
updateCfgDisplay();

stepsSelect.addEventListener('input', updateStepsDisplay);
updateStepsDisplay();
|
|
|
|
|
|
// Build a formatter that left-pads a value's string form with zeros.
const zeroPadTo = width => value => value.toString().padStart(width, '0');

// Two- and three-digit zero padding used by the timestamp formatter.
const pad2 = zeroPadTo(2);
const pad3 = zeroPadTo(3);
|
|
|
|
|
|
/**
 * Format the current local time as `YYYY-MM-DD HH:MM:SS.mmm`.
 *
 * @returns {string} The formatted timestamp.
 */
const formatLocalTimestamp = () => {
  const now = new Date();
  const datePart = [
    now.getFullYear(),
    pad2(now.getMonth() + 1), // getMonth() is zero-based
    pad2(now.getDate()),
  ].join('-');
  const timePart = [
    pad2(now.getHours()),
    pad2(now.getMinutes()),
    pad2(now.getSeconds()),
  ].join(':');
  return `${datePart} ${timePart}.${pad3(now.getMilliseconds())}`;
};
|
|
|
|
|
|
/**
 * Render a duration with two decimals.
 *
 * @param {*} raw - Value to convert with Number().
 * @returns {string} The fixed-point text, or '0.00' when the value is not finite.
 */
const formatSeconds = raw => {
  const seconds = Number(raw);
  if (!Number.isFinite(seconds)) {
    return '0.00';
  }
  return seconds.toFixed(2);
};
|
|
|
|
|
|
/**
 * Convert a log timestamp into a Date.
 *
 * Strings containing `YYYY-MM-DD HH:MM:SS.mmm` have their first space
 * replaced by `T` before parsing. A missing value, or one that cannot be
 * parsed, yields the current time so callers never receive an Invalid Date
 * (whose NaN time value would make sort comparisons inconsistent).
 *
 * @param {*} value - Timestamp text (or anything the Date constructor accepts).
 * @returns {Date} A valid Date.
 */
const parseTimestamp = value => {
  if (!value) {
    return new Date();
  }
  const isLocalStamp = /\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}\.\d{3}/.test(value);
  const parsed = new Date(isLocalStamp ? value.replace(' ', 'T') : value);
  return Number.isNaN(parsed.getTime()) ? new Date() : parsed;
};
|
|
|
|
|
|
/**
 * Store and display the amount of audio generated by the model.
 * Values that are not finite numbers are ignored; negatives clamp to zero.
 *
 * @param {*} value - Seconds of generated audio.
 */
const setModelGenerated = value => {
  const seconds = Number(value);
  if (Number.isFinite(seconds)) {
    modelGeneratedTotal = Math.max(0, seconds);
    modelGeneratedLabel.textContent = formatSeconds(modelGeneratedTotal);
  }
};
|
|
|
|
|
|
/**
 * Store and display the played duration, clamped to the range
 * [0, modelGeneratedTotal].
 *
 * @param {number} value - Seconds of audio played.
 */
const setPlaybackElapsed = value => {
  const nonNegative = Math.max(0, value);
  lastPlaybackElapsed = Math.min(modelGeneratedTotal, nonNegative);
  playbackElapsedLabel.textContent = formatSeconds(lastPlaybackElapsed);
};
|
|
|
|
|
|
// --- Streaming text preview ---------------------------------------------
// Preview speed in tokens per minute and the resulting delay between tokens.
const STREAMING_WPM = 180;
const STREAMING_INTERVAL_MS = 60000 / STREAMING_WPM;

// Pending setTimeout handle for the next preview tick, if any.
let previewTimeoutId = null;
// Tokens of the prompt and the index of the next one to reveal.
let previewTokens = [];
let previewIndex = 0;
// True while a session is running; gates live preview refreshes on input.
let previewActive = false;
|
|
|
|
|
|
/**
 * Cancel the pending preview tick, if one is scheduled.
 */
const clearPreviewTimer = () => {
  if (!previewTimeoutId) {
    return;
  }
  clearTimeout(previewTimeoutId);
  previewTimeoutId = null;
};
|
|
|
|
|
|
/**
 * Put the preview box into its idle look and replace its text.
 *
 * @param {string} message - Text to display in the preview box.
 */
const setPreviewIdle = message => {
  if (streamingPreview) {
    streamingPreview.classList.remove('streaming-active');
    streamingPreview.textContent = message;
  }
};
|
|
|
|
|
|
/**
 * Reveal the next preview token and schedule the following tick.
 * When every token has been shown, the blinking caret class is removed
 * and no further tick is scheduled.
 */
const schedulePreviewTick = () => {
  if (!streamingPreview) {
    return;
  }

  const hasMoreTokens = previewIndex < previewTokens.length;
  if (!hasMoreTokens) {
    streamingPreview.classList.remove('streaming-active');
    return;
  }

  streamingPreview.classList.add('streaming-active');

  const nextToken = previewTokens[previewIndex];
  streamingPreview.textContent += nextToken;
  previewIndex += 1;

  previewTimeoutId = setTimeout(schedulePreviewTick, STREAMING_INTERVAL_MS);
};
|
|
|
|
|
|
/**
 * Restart the preview from the beginning of the current prompt text.
 * The prompt (minus trailing whitespace) is split into tokens, each a run
 * of non-whitespace plus the whitespace that follows it.
 */
const updateStreamingPreview = () => {
  if (!streamingPreview) {
    return;
  }

  clearPreviewTimer();

  const promptText = promptInput?.value || '';
  const tokens = promptText.trimEnd().match(/\S+\s*/g);

  previewIndex = 0;
  streamingPreview.textContent = '';
  previewTokens = tokens || [];

  schedulePreviewTick();
};
|
|
|
|
|
|
/**
 * Empty the log panel and its backing list, and reset the
 * "Model Generated Audio" metric to zero.
 */
const clearLogs = () => {
  if (logOutput) {
    logOutput.textContent = '';
  }
  // Truncate in place; logEntries is a const binding.
  logEntries.length = 0;

  modelGeneratedTotal = 0;
  setModelGenerated(0);
};
|
|
|
|
|
|
/**
 * Add a line to the log panel.
 *
 * Entries are kept ordered by timestamp, with arrival order as the
 * tie-breaker; only the newest 400 are retained. The panel is re-rendered
 * in full and scrolled to the bottom.
 *
 * @param {string} message - Text of the log line.
 * @param {string} [timestamp] - Timestamp to display; defaults to the current local time.
 */
const appendLog = (message, timestamp) => {
  if (!logOutput) {
    return;
  }

  const MAX_ENTRIES = 400;
  const finalTimestamp = timestamp || formatLocalTimestamp();
  const date = parseTimestamp(finalTimestamp);
  logSequence += 1;
  logEntries.push({
    timestamp: finalTimestamp,
    date,
    message,
    seq: logSequence,
  });

  logEntries.sort((a, b) => {
    const byTime = a.date.getTime() - b.date.getTime();
    if (byTime !== 0) {
      return byTime;
    }
    return a.seq - b.seq;
  });

  const overflow = logEntries.length - MAX_ENTRIES;
  if (overflow > 0) {
    logEntries.splice(0, overflow);
  }

  const lines = [];
  for (const item of logEntries) {
    lines.push(`[${item.timestamp}] ${item.message}`);
  }
  logOutput.textContent = lines.join('\n');
  logOutput.scrollTop = logOutput.scrollHeight;
};
|
|
|
|
|
|
/**
 * Save-button handler: build a WAV file from the recorded audio and
 * trigger a browser download through a temporary anchor element.
 */
const handleSaveClick = () => {
  if (!recordedSamples) {
    appendLog('[Frontend] Save requested but no audio received yet');
    return;
  }

  const wavBlob = createWavBlob();
  if (!wavBlob) {
    appendLog('[Error] Failed to assemble WAV data for download');
    return;
  }

  // Only one object URL is kept alive at a time.
  revokeDownloadUrl();
  downloadUrl = URL.createObjectURL(wavBlob);

  const anchor = document.createElement('a');
  // ':' and '.' are replaced so the ISO timestamp is filename-friendly.
  const fileStamp = new Date().toISOString().replace(/[:.]/g, '-');
  anchor.href = downloadUrl;
  anchor.download = `vibevoice_realtime_audio_${fileStamp}.wav`;

  document.body.appendChild(anchor);
  anchor.click();
  document.body.removeChild(anchor);

  appendLog('[Frontend] Audio download triggered');
};
|
|
|
|
|
|
/**
 * Stop the interval that refreshes the "Audio Played" read-out.
 */
const stopPlaybackTimer = () => {
  if (!playbackTimer) {
    return;
  }
  clearInterval(playbackTimer);
  playbackTimer = null;
};
|
|
|
|
|
|
/**
 * (Re)start the interval that converts the played sample count into
 * seconds and refreshes the "Audio Played" read-out every 250 ms.
 */
const startPlaybackTimer = () => {
  stopPlaybackTimer();

  const refreshElapsed = () => {
    setPlaybackElapsed(playbackSamples / SAMPLE_RATE);
  };
  playbackTimer = setInterval(refreshElapsed, 250);
};
|
|
|
|
|
|
/**
 * Fetch `/config` and fill the speaker drop-down from its `voices` array.
 *
 * `default_voice` is pre-selected when it appears in the list. When the
 * list is empty or the request fails, the drop-down shows a single
 * placeholder entry and stays disabled.
 */
const loadVoices = async () => {
  // Append one <option> to the speaker drop-down.
  const addOption = (value, label) => {
    const option = document.createElement('option');
    option.value = value;
    option.textContent = label;
    voiceSelect.appendChild(option);
  };

  try {
    voiceSelect.disabled = true;

    const response = await fetch('/config');
    if (!response.ok) {
      throw new Error(`Failed to fetch config: ${response.status}`);
    }

    const data = await response.json();
    const voices = Array.isArray(data.voices) ? data.voices : [];
    voiceSelect.innerHTML = '';

    if (voices.length === 0) {
      addOption('', 'No voices available');
      voiceSelect.disabled = true;
      appendLog('[Error] No voice presets available');
      return;
    }

    for (const voice of voices) {
      addOption(voice, voice);
    }

    const preferred = data.default_voice;
    if (preferred && voices.includes(preferred)) {
      voiceSelect.value = preferred;
    }

    voiceSelect.disabled = false;
    appendLog(`[Frontend] Loaded ${voices.length} voice presets`);
  } catch (err) {
    console.error('Failed to load voices', err);
    voiceSelect.innerHTML = '';
    addOption('', 'Load failed');
    voiceSelect.disabled = true;
    appendLog('[Error] Failed to load voice presets');
  }
};
|
|
|
|
|
|
loadVoices();
|
|
|
|
|
|
// Restore both sliders to the defaults declared in the markup (CFG 1.5, 5 steps).
const handleResetClick = () => {
  cfgSelect.value = '1.5';
  updateCfgDisplay();

  stepsSelect.value = '5';
  updateStepsDisplay();

  appendLog('[Frontend] Controls reset to defaults (CFG=1.5, Steps=5)');
};
resetBtn.addEventListener('click', handleResetClick);
|
|
|
|
|
|
// While a session is active, editing the prompt restarts the streaming preview.
const handlePromptInput = () => {
  if (!previewActive) {
    return;
  }
  updateStreamingPreview();
};
if (promptInput) {
  promptInput.addEventListener('input', handlePromptInput);
}
|
|
|
|
|
|
/**
 * Handle a text frame received on the streaming socket.
 *
 * The frame is expected to be JSON of the form
 * `{ type: 'log', event, data, timestamp }`. Frames that fail to parse, or
 * that are not of type 'log', are written to the log panel verbatim.
 * `model_progress` events update the "Model Generated Audio" metric without
 * logging; `backend_stream_complete` also marks the recording as complete.
 *
 * @param {string} raw - Text payload of the socket message.
 */
const handleLogMessage = raw => {
  let payload;
  try {
    payload = JSON.parse(raw);
  } catch (err) {
    appendLog(`[Error] Failed to parse log message: ${raw}`);
    return;
  }

  if (!payload || payload.type !== 'log') {
    appendLog(`[Log] ${raw}`);
    return;
  }

  const { event, timestamp } = payload;
  // A destructuring default only covers `undefined`; `??` also covers an
  // explicit `"data": null`, which would otherwise throw on property access.
  const data = payload.data ?? {};

  switch (event) {
    case 'backend_request_received':
      appendLog(`[Backend] Received request`, timestamp);
      break;
    case 'backend_first_chunk_sent':
      appendLog('[Backend] Sent first audio chunk', timestamp);
      break;
    case 'model_progress':
      if (typeof data.generated_sec !== 'undefined') {
        const generated = Number(data.generated_sec);
        if (Number.isFinite(generated)) {
          setModelGenerated(generated);
        }
      }
      break;
    case 'generation_error':
      appendLog(`[Error] Generation error: ${data.message || 'Unknown error'}`, timestamp);
      break;
    case 'backend_error':
      appendLog(`[Error] Backend error: ${data.message || 'Unknown error'}`, timestamp);
      break;
    case 'client_disconnected':
      appendLog('[Frontend] Client disconnected', timestamp);
      break;
    case 'backend_stream_complete':
      appendLog('[Backend] Backend finished', timestamp);
      recordingComplete = true;
      updateSaveButtonState();
      break;
    default:
      appendLog(`[Log] Event ${event}`, timestamp);
      break;
  }
};
|
|
|
|
|
|
/**
 * Make the main button's caption and `playing` class reflect `isPlaying`.
 */
const updateButtonLabel = () => {
  const label = isPlaying ? 'Stop' : 'Start';
  controlBtn.textContent = label;
  controlBtn.classList.toggle('playing', isPlaying);
};
|
|
|
|
|
|
/**
 * Append decoded samples to the end of the playback queue.
 *
 * @param {Float32Array} chunk - Samples to enqueue.
 */
const appendAudio = chunk => {
  const queuedLength = buffer.length;
  const combined = new Float32Array(queuedLength + chunk.length);
  combined.set(buffer);
  combined.set(chunk, queuedLength);
  buffer = combined;
};
|
|
|
|
|
|
/**
 * Take `frameCount` samples from the front of the playback queue.
 * When fewer are queued, the result is zero-padded to `frameCount` and the
 * queue is left empty.
 *
 * @param {number} frameCount - Number of samples requested.
 * @returns {Float32Array} Exactly `frameCount` samples.
 */
const pullAudio = frameCount => {
  const queued = buffer;

  // More than enough queued: hand out the head and keep the tail.
  if (queued.length > frameCount) {
    buffer = queued.subarray(frameCount);
    return queued.subarray(0, frameCount);
  }

  // Nothing queued: pure silence, queue untouched.
  if (queued.length === 0) {
    return new Float32Array(frameCount);
  }

  // The whole queue is consumed by this request.
  buffer = new Float32Array(0);
  if (queued.length === frameCount) {
    return queued;
  }
  const padded = new Float32Array(frameCount);
  padded.set(queued, 0);
  return padded;
};
|
|
|
|
|
|
/**
 * Drop the current streaming socket, closing it if it is open or connecting.
 *
 * The socket's handlers are detached before closing. `close()` completes
 * asynchronously, so without this a late `close` or `message` event from the
 * discarded socket could run after a new session has started and clear the
 * new session's `socket` reference or push stale audio into its queue.
 */
const closeSocket = () => {
  const stale = socket;
  socket = null;
  if (!stale) {
    return;
  }

  stale.onmessage = null;
  stale.onerror = null;
  stale.onclose = null;

  const { readyState } = stale;
  if (readyState === WebSocket.OPEN || readyState === WebSocket.CONNECTING) {
    stale.close();
  }
};
|
|
|
|
|
|
/**
 * Empty the playback queue and clear the per-session playback flags.
 *
 * @param {boolean} [resetSamples=true] - Also zero the played-sample
 *   counter and the "Audio Played" read-out.
 */
const resetPlaybackFlags = (resetSamples = true) => {
  buffer = new Float32Array(0);

  if (resetSamples) {
    playbackSamples = 0;
    setPlaybackElapsed(0);
  }

  // Pre-buffer and end-of-stream tracking.
  hasStartedPlayback = false;
  silentFrameCount = 0;

  // One-shot log guards.
  firstBrowserChunkLogged = false;
  playbackStartedLogged = false;
};
|
|
|
|
|
|
/**
 * Disconnect the script processor node and close the audio context.
 *
 * `AudioContext.close()` reports failure by rejecting the Promise it
 * returns, which a surrounding try/catch cannot observe; the rejection is
 * handled explicitly so it does not surface as an unhandled rejection.
 * Failures are logged as warnings and never abort the teardown.
 */
const teardownAudio = () => {
  if (scriptNode) {
    try {
      scriptNode.disconnect();
    } catch (err) {
      console.warn('disconnect error', err);
    }
    scriptNode.onaudioprocess = null;
  }

  if (audioCtx) {
    try {
      const closing = audioCtx.close();
      // Guarded with ?. in case an implementation does not return a Promise.
      closing?.catch?.(err => console.warn('audioCtx.close error', err));
    } catch (err) {
      console.warn('audioCtx.close error', err);
    }
  }

  audioCtx = null;
  scriptNode = null;
};
|
|
|
|
|
|
/**
 * Return the player to idle: close the socket, tear down Web Audio,
 * clear the playback flags and stop the elapsed-time timer.
 *
 * @param {boolean} [resetSamples=true] - Forwarded to resetPlaybackFlags.
 */
const resetState = (resetSamples = true) => {
  // Release external resources first.
  closeSocket();
  teardownAudio();

  // Then clear in-memory state.
  resetPlaybackFlags(resetSamples);
  isPlaying = false;
  stopPlaybackTimer();
};
|
|
|
|
|
|
/**
 * Build a fresh AudioContext plus ScriptProcessor node that plays samples
 * from the shared queue.
 *
 * The audio callback outputs silence until PREBUFFER_SEC worth of samples
 * is queued (or the socket is already finished), then streams from the
 * queue. After the socket has finished and the queue is drained, four
 * consecutive all-zero callbacks trigger stop().
 */
const createAudioChain = () => {
  teardownAudio();
  resetPlaybackFlags();

  const AudioContextCtor = window.AudioContext || window.webkitAudioContext;
  audioCtx = new AudioContextCtor({ sampleRate: SAMPLE_RATE });
  scriptNode = audioCtx.createScriptProcessor(BUFFER_SIZE, 0, 1);

  const minBufferSamples = Math.floor(audioCtx.sampleRate * PREBUFFER_SEC);

  // True when there is no socket, or it is closing/closed.
  const isSocketFinished = () =>
    !socket || socket.readyState === WebSocket.CLOSED || socket.readyState === WebSocket.CLOSING;

  const handleAudioProcess = event => {
    const output = event.outputBuffer.getChannelData(0);
    const socketClosed = isSocketFinished();

    if (!hasStartedPlayback) {
      const readyToPlay = buffer.length >= minBufferSamples || socketClosed;
      if (!readyToPlay) {
        // Still pre-buffering: emit silence.
        output.fill(0);
        return;
      }
      hasStartedPlayback = true;
      if (!playbackStartedLogged) {
        playbackStartedLogged = true;
        appendLog('[Frontend] Browser started to play audio');
        startPlaybackTimer();
      }
    }

    const chunk = pullAudio(output.length);
    output.set(chunk);

    if (hasStartedPlayback) {
      playbackSamples += output.length;
    }

    const drained = socketClosed && buffer.length === 0 && chunk.every(sample => sample === 0);
    if (!drained) {
      silentFrameCount = 0;
      return;
    }
    silentFrameCount += 1;
    if (silentFrameCount >= 4) {
      stop();
    }
  };

  scriptNode.onaudioprocess = handleAudioProcess;
  scriptNode.connect(audioCtx.destination);
};
|
|
|
|
|
|
/**
 * Begin a new synthesis session.
 *
 * Clears the logs, metrics and previous recording, rebuilds the audio
 * chain, and opens a WebSocket to `/stream` with the prompt text and the
 * selected options as query parameters. Binary frames are decoded as
 * little-endian 16-bit PCM, queued for playback and kept for the Save
 * button; text frames go to handleLogMessage.
 *
 * Every handler is bound to the socket created here and ignores events once
 * that socket is no longer the current one. The close handshake is
 * asynchronous, so a previous session's late `close` or `message` event
 * must not clear or feed the session that replaced it.
 */
const start = () => {
  if (isPlaying) {
    return;
  }

  const textValue = promptInput?.value || '';
  const cfgValue = Number(cfgSelect.value);
  const stepsValue = Number(stepsSelect.value);
  const voiceValue = voiceSelect.value || '';

  clearLogs();
  const cfgDisplay = Number.isFinite(cfgValue) ? cfgValue.toFixed(3) : 'default';
  const stepsDisplay = Number.isFinite(stepsValue) ? stepsValue : 'default';
  appendLog(`[Frontend] Start button clicked, CFG=${cfgDisplay}, Steps=${stepsDisplay}, Speaker=${voiceValue || 'default'}`);
  setModelGenerated(0);
  setPlaybackElapsed(0);

  resetState(true);
  clearRecordedChunks();
  isPlaying = true;
  previewActive = true;
  updateStreamingPreview();
  updateButtonLabel();
  createAudioChain();

  const params = new URLSearchParams();
  params.set('text', textValue);
  if (!Number.isNaN(cfgValue)) {
    params.set('cfg', cfgValue.toFixed(3));
  }
  if (!Number.isNaN(stepsValue)) {
    params.set('steps', stepsValue.toString());
  }
  if (voiceValue) {
    params.set('voice', voiceValue);
  }
  // http -> ws and https -> wss.
  const wsUrl = `${location.origin.replace(/^http/, 'ws')}/stream?${params.toString()}`;

  const ws = new WebSocket(wsUrl);
  ws.binaryType = 'arraybuffer';
  socket = ws;
  const isCurrentSocket = () => socket === ws;

  ws.onmessage = event => {
    if (!isCurrentSocket()) {
      return;
    }
    if (typeof event.data === 'string') {
      handleLogMessage(event.data);
      return;
    }
    if (!(event.data instanceof ArrayBuffer)) {
      return;
    }

    const rawBuffer = event.data.slice(0);
    const view = new DataView(rawBuffer);
    // Whole samples only; a trailing odd byte is ignored.
    const sampleCount = Math.floor(view.byteLength / 2);
    const floatChunk = new Float32Array(sampleCount);
    for (let i = 0; i < sampleCount; i += 1) {
      floatChunk[i] = view.getInt16(i * 2, true) / 32768;
    }
    appendAudio(floatChunk);
    recordedChunks.push(rawBuffer);
    recordedSamples += sampleCount;
    updateSaveButtonState();

    if (!firstBrowserChunkLogged) {
      firstBrowserChunkLogged = true;
      appendLog('[Frontend] Received first audio chunk');
    }
  };

  ws.onerror = err => {
    if (!isCurrentSocket()) {
      return;
    }
    console.error('WebSocket error', err);
    // WebSocket error events usually carry no message; avoid logging "[object Event]".
    appendLog(`[Error] WebSocket error: ${err?.message || 'connection error'}`);
    stop();
  };

  ws.onclose = () => {
    if (!isCurrentSocket()) {
      return;
    }
    socket = null;
    if (recordedSamples > 0) {
      recordingComplete = true;
      updateSaveButtonState();
    }
  };
};
|
|
|
|
|
|
/**
 * End the current session. Resources are always released (the played-sample
 * counter is preserved); when a session was actually running, the elapsed
 * read-out is finalised, the stop is logged, any received audio becomes
 * saveable and the streaming preview is halted.
 */
const stop = () => {
  const wasPlaying = isPlaying;
  resetState(false);

  if (wasPlaying) {
    setPlaybackElapsed(Math.min(lastPlaybackElapsed, modelGeneratedTotal));
    appendLog('[Frontend] Playback stopped');

    if (recordedSamples > 0) {
      recordingComplete = true;
      updateSaveButtonState();
    }

    previewActive = false;
    clearPreviewTimer();
    streamingPreview?.classList.remove('streaming-active');
  }

  updateButtonLabel();
};
|
|
|
|
|
|
// The main button toggles between starting and stopping a session.
const handleControlClick = () => {
  if (!isPlaying) {
    start();
    return;
  }
  stop();
};
controlBtn.addEventListener('click', handleControlClick);

if (saveBtn) {
  saveBtn.addEventListener('click', handleSaveClick);
}

// Initial UI state.
updateButtonLabel();
updateSaveButtonState();

// Release the socket, audio objects, timers and object URL when leaving the page.
const handleBeforeUnload = () => {
  resetState();
  clearPreviewTimer();
  revokeDownloadUrl();
};
window.addEventListener('beforeunload', handleBeforeUnload);
|
|
|
})();
|
|
|
</script>
|
|
|
</body>
|
|
|
</html>
|
|
|
|