microsoft/TypeAgent

Public

mirrored from https://github.com/microsoft/TypeAgent — Available

Code · Commits · Issues · Pull requests · Actions · Insights · Security
ca5f385b23f2caef43ded65cb3d1fed21117e6cb

Branches

Tags

  • No tags available.
0Branches0Tags
Go to file
Add file
Code

Clone

HTTPS

Download ZIP

python/tts/speechT5/speechT5.py

89lines · modecode

1# Copyright (c) Microsoft Corporation.
2# Licensed under the MIT License.
3
4from io import BytesIO
5import wave;
6import uvicorn
7import logging
8import torch
9from fastapi import FastAPI
10from fastapi.middleware.cors import CORSMiddleware
11from fastapi.responses import JSONResponse, StreamingResponse
12from transformers import pipeline
13from datasets import load_dataset
14import struct
15from pydantic import BaseModel
16import time
17
import os

# Allow the process to continue when more than one OpenMP runtime gets loaded.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

# Logging: only ERROR and above, each record prefixed with timestamp and level.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.ERROR,
)

# The ASGI application object that the route decorators below attach to.
app = FastAPI()

# CORS: accept any origin, method and header, with credentials allowed.
app.add_middleware(
    CORSMiddleware,
    allow_headers=["*"],
    allow_methods=["*"],
    allow_credentials=True,
    allow_origins=["*"],
)
35
# Load the model
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print("Loading model...")
synthesizer = pipeline("text-to-speech", "microsoft/speecht5_tts", device=device)
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")

# Build both per-voice tables in one pass so they are index-aligned:
#   filenames[i]          -> [source filename, index as a string]
#   speaker_embeddings[i] -> the entry's x-vector as a tensor with an extra
#                            leading dimension of size 1
# The string index in `filenames` is what a client sends back as `voiceName`.
filenames = []
speaker_embeddings = []
for index, entry in enumerate(embeddings_dataset):
    filenames.append([entry["filename"], str(index)])
    speaker_embeddings.append(torch.tensor(entry["xvector"]).unsqueeze(0))

print("Model loaded!")
48
49
# GET /voices
# Responds with the module-level `filenames` table as JSON: one
# [filename, index-as-string] pair per available speaker embedding.
@app.get("/voices")
async def voices():
    voice_table = filenames
    return JSONResponse(status_code=200, content=voice_table)
53
# Request body accepted by POST /synthesize.
class SynthesizeRequest(BaseModel):
    # The text to turn into speech (required).
    text: str

    # Optional voice selector: the decimal index of a speaker embedding,
    # sent as a string. When omitted, the handler picks its default voice.
    voiceName: str | None = None
57
# Index into speaker_embeddings used when a request does not name a voice.
_DEFAULT_VOICE_INDEX = 7306

# Largest magnitude representable in signed 24-bit PCM.
_PCM24_FULL_SCALE = 2**23 - 1


def _resolve_voice_index(voice_name):
    """Translate a request's ``voiceName`` into an index into speaker_embeddings.

    Returns the default index when *voice_name* is None. Raises ValueError
    when the value is not an integer or lies outside the embeddings table
    (negative values are rejected rather than wrapping around to the end).
    """
    if voice_name is None:
        return _DEFAULT_VOICE_INDEX
    index = int(voice_name)
    if not 0 <= index < len(speaker_embeddings):
        raise ValueError(f"voice index {index} is out of range")
    return index


def _encode_pcm24(samples):
    """Return float *samples* as little-endian signed 24-bit PCM bytes."""
    pack_int32 = struct.Struct('<i').pack
    frames = []
    for sample in samples:
        level = float(sample)
        # Clamp to [-1, 1]: a larger value would exceed the 24-bit range and,
        # once truncated to three bytes, wrap around to the opposite polarity.
        if level > 1.0:
            level = 1.0
        elif level < -1.0:
            level = -1.0
        # Pack as a 32-bit little-endian int and keep the three low bytes.
        frames.append(pack_int32(int(level * _PCM24_FULL_SCALE))[:3])
    return b"".join(frames)


@app.post("/synthesize")
async def synthesize(data: SynthesizeRequest):
    """Synthesize speech for the given text and return it as a WAV stream.

    The audio is mono, 24-bit PCM, at the sampling rate reported by the
    synthesizer. ``voiceName`` selects a speaker by the index listed by
    ``/voices``; a default voice is used when it is omitted.

    Responses:
        200: ``audio/wav`` body.
        400: ``voiceName`` is not a valid voice index.
        500: any other failure (details are logged, not returned).
    """
    # Reject bad voice selectors up front so the client gets a 400 rather
    # than a generic 500 (or, for negative numbers, an unintended voice).
    try:
        voiceName = _resolve_voice_index(data.voiceName)
    except ValueError:
        return JSONResponse(
            content={"error": "voiceName must be a voice index returned by /voices"},
            status_code=400,
        )

    try:
        text = data.text
        print("Synthesizing with voice", voiceName, ":", text)
        # NOTE(review): this call is synchronous inside an async handler, so
        # it presumably occupies the event loop for its duration — confirm
        # whether that is acceptable for the expected load.
        start = time.time()
        speech = synthesizer(text, forward_params={"speaker_embeddings": speaker_embeddings[voiceName]})
        end = time.time()
        print("Synthesized in", end-start, "seconds")

        byte_io = BytesIO()
        # The context manager finalizes the WAV header even if encoding
        # fails; the BytesIO itself stays open for the response.
        with wave.open(byte_io, 'wb') as f:
            f.setnchannels(1)
            f.setsampwidth(3)
            f.setframerate(speech["sampling_rate"])
            # One write for the whole clip instead of one call per sample.
            f.writeframes(_encode_pcm24(speech["audio"]))
        byte_io.seek(0)
        print("Written in", time.time()-end, "seconds")
        return StreamingResponse(byte_io, media_type="audio/wav")
    except Exception:
        # Top-level boundary: log the traceback, hide details from the client.
        logging.error("An error occurred during synthesize", exc_info=True)
        return JSONResponse(content={"error": "An internal error has occurred!"}, status_code=500)
86
87
# Script entry point: serve the app with uvicorn on every network
# interface, port 8002.
if __name__ == "__main__":
    uvicorn.run(
        app,
        port=8002,
        host="0.0.0.0",
    )