first commit
BIN
Lab/Lab2/code/tang1.wav
Normal file
Binary file not shown.
623
Lab/Lab2/code/test.ipynb
Normal file
File diff suppressed because one or more lines are too long
179
Lab/Lab2/code/test.py
Normal file
@@ -0,0 +1,179 @@
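# Lab 2: speech endpoint detection. The script computes the short-time
# energy and a modified short-time zero-crossing rate (sign function with a
# noise dead zone), locates speech endpoints and unvoiced/voiced boundaries
# with simple thresholds and a small state machine, and plots the results.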
from typing import Optional

import scipy.io.wavfile as wav
import numpy as np
import matplotlib.pyplot as plt
import ipdb


def hamming(frame_length: int) -> np.ndarray:
    # frame_length - window length in samples
    n = np.arange(frame_length)
    # Standard Hamming window coefficients (0.54, 0.46)
    h = 0.54 - 0.46 * np.cos(2 * np.pi * n / (frame_length - 1))
    return h


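# delta_sgn: three-level sign function with a dead zone. Samples whose
# magnitude stays below max|x|/20 are mapped to 0, so small fluctuations
# around zero (background noise) do not later register as zero crossings.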
def delta_sgn(x: np.ndarray) -> np.ndarray:
    # x - speech signal
    sound = x
    threshold = np.max(np.abs(sound)) / 20
    negative_sound = sound + threshold
    negative_sound -= np.abs(negative_sound)  # zero above -threshold, negative below
    positive_sound = sound - threshold
    positive_sound += np.abs(positive_sound)  # zero below +threshold, positive above
    sound = negative_sound + positive_sound
    return np.sign(sound)


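# ampf: short-time energy. The signal is split into overlapping frames of
# FrameLen samples taken every inc samples; each frame is weighted by the
# squared Hamming window, and the squared samples are summed and divided by
# FrameLen.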
def ampf(
    x: np.ndarray, FrameLen: Optional[int] = 128, inc: Optional[int] = 90
) -> np.ndarray:
    # x - time-domain speech signal
    # FrameLen - length of each frame
    # inc - frame shift (step size)

    frames = []
    for i in range(0, len(x) - FrameLen, inc):
        frame = x[i : i + FrameLen]
        frames.append(frame)
    frames = np.array(frames)

    h = hamming(frame_length=FrameLen)  # Hamming window
    amp = np.dot(frames**2, h.T**2).T / FrameLen

    return amp


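# zcrf: short-time zero-crossing rate. A sign change between neighbouring
# samples gives |sgn(x[n]) - sgn(x[n-1])| = 2, so summing the absolute sign
# differences over a frame and dividing by 2*FrameLen yields the average
# crossing rate per sample.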
def zcrf(
    x: np.ndarray, FrameLen: Optional[int] = 128, inc: Optional[int] = 90
) -> np.ndarray:
    # x - time-domain speech signal
    # FrameLen - length of each frame
    # inc - frame shift (step size)

    sound = x
    sgn_sound = np.sign(sound)

    dif_sound = np.abs(sgn_sound[1:] - sgn_sound[:-1])
    h = np.ones((FrameLen,)) / (2 * FrameLen)

    frames = []
    for i in range(0, len(dif_sound) - FrameLen, inc):
        frame = dif_sound[i : i + FrameLen]
        frames.append(frame)

    frames = np.array(frames)
    zcr = np.dot(frames, h.T).T
    return zcr


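# zcrf_delta: identical to zcrf except that the plain sign function is
# replaced by delta_sgn, so low-level fluctuations inside the dead zone are
# not counted as zero crossings. analyze_sound uses this variant.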
def zcrf_delta(
    x: np.ndarray, FrameLen: Optional[int] = 128, inc: Optional[int] = 90
) -> np.ndarray:
    # x - time-domain speech signal
    # FrameLen - length of each frame
    # inc - frame shift (step size)

    sound = x
    sgn_sound = delta_sgn(sound)

    dif_sound = np.abs(sgn_sound[1:] - sgn_sound[:-1])
    h = np.ones((FrameLen,)) / (2 * FrameLen)

    frames = []
    for i in range(0, len(dif_sound) - FrameLen, inc):
        frame = dif_sound[i : i + FrameLen]
        frames.append(frame)

    frames = np.array(frames)
    zcr = np.dot(frames, h.T).T
    return zcr


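# analyze_sound: full pipeline. It reads the wav file, keeps one channel,
# normalizes, computes the short-time energy (ampf) and the modified ZCR
# (zcrf_delta), and derives a threshold for each feature from its dynamic
# range. A three-state scan over the frames then marks a speech onset (x1)
# when the ZCR stays above its threshold, an unvoiced-to-voiced boundary (x3)
# when the energy also rises above its threshold, and the end of speech (x2)
# when both features fall back below their thresholds. The boundaries are
# drawn as red (x1), blue (x2), and green (x3) dashed lines on all three plots.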
def analyze_sound(
    filename: str, FrameLen: Optional[int] = 128, inc: Optional[int] = 90
) -> None:
    sr, sound_array = wav.read(filename)
    # Keep only the first channel if the file is not mono
    sound_array = sound_array.T[0, :] if sound_array.ndim != 1 else sound_array
    sound_array = sound_array / np.max(np.abs(sound_array))  # normalize

    amp = ampf(sound_array, FrameLen, inc)
    zcr = zcrf_delta(sound_array, FrameLen, inc)

    # Convert frame indices back to sample positions for plotting
    rescale_rate = len(sound_array) / amp.shape[0]
    frameTime = np.arange(len(amp)) * rescale_rate

    # Boundary detection
    x1 = []  # speech onsets
    x2 = []  # speech ends
    x3 = []  # unvoiced/voiced boundaries
    amp2 = np.min(amp) + (np.max(amp) - np.min(amp)) / 20  # energy threshold
    zcr2 = np.min(zcr) + (np.max(zcr) - np.min(zcr)) / 18  # ZCR threshold

    threshold_len = 6  # consecutive frames required to change state
    state = 1
    for i in range(threshold_len, len(amp) - threshold_len):
        if state == 1:
            if np.all(zcr[i : i + threshold_len] > zcr2):
                x1.append(i * rescale_rate)
                state = 2
        elif state == 2:
            if np.all(amp[i : i + threshold_len] > amp2):
                x3.append(i * rescale_rate)
                state = 3
        if (
            state != 1
            and np.all(amp[i : i + threshold_len] < amp2)
            and np.all(zcr[i : i + threshold_len] < zcr2)
        ):
            x2.append(i * rescale_rate)
            state = 1

    # Plot the speech waveform, short-time energy, and short-time zero-crossing rate
    plt.figure(figsize=(12, 8))

    # Speech waveform
    plt.subplot(3, 1, 1)
    plt.plot(sound_array)
    plt.title("Waveform")
    for boundary in x1:
        plt.axvline(x=boundary, color="r", linestyle="--", linewidth=0.5)
    for boundary in x2:
        plt.axvline(x=boundary, color="b", linestyle="--", linewidth=0.5)
    for boundary in x3:
        plt.axvline(x=boundary, color="g", linestyle="--", linewidth=0.5)

    # Short-time energy
    plt.subplot(3, 1, 2)
    plt.plot(frameTime, amp, label="Energy")
    plt.axhline(y=amp2, color="r", linestyle="--", label="Energy Threshold")
    plt.legend()
    plt.title("Short-time Energy")
    for boundary in x1:
        plt.axvline(x=boundary, color="r", linestyle="--", linewidth=0.5)
    for boundary in x2:
        plt.axvline(x=boundary, color="b", linestyle="--", linewidth=0.5)
    for boundary in x3:
        plt.axvline(x=boundary, color="g", linestyle="--", linewidth=0.5)

    # Short-time zero-crossing rate
    plt.subplot(3, 1, 3)
    plt.plot(frameTime, zcr, label="Zero Crossing Rate")
    plt.axhline(y=zcr2, color="r", linestyle="--", label="ZCR Threshold")
    plt.legend()
    plt.title("Short-time Zero Crossing Rate")

    # Mark the speech endpoints and unvoiced/voiced boundaries
    for boundary in x1:
        plt.axvline(x=boundary, color="r", linestyle="--", linewidth=0.5)
    for boundary in x2:
        plt.axvline(x=boundary, color="b", linestyle="--", linewidth=0.5)
    for boundary in x3:
        plt.axvline(x=boundary, color="g", linestyle="--", linewidth=0.5)

    plt.tight_layout()
    plt.show()


if __name__ == "__main__":
    analyze_sound("tang1.wav", FrameLen=128, inc=90)
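# Note: voice.wav is also added in this commit; it can be analyzed the same
# way, e.g. analyze_sound("voice.wav", FrameLen=128, inc=90).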
BIN
Lab/Lab2/code/voice.wav
Normal file
Binary file not shown.