first commit
BIN
Lab/Lab2/code/tang1.wav
Normal file
Binary file not shown.
623
Lab/Lab2/code/test.ipynb
Normal file
File diff suppressed because one or more lines are too long
179
Lab/Lab2/code/test.py
Normal file
@@ -0,0 +1,179 @@
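# Lab 2: speech endpoint detection. The script computes the short-time
# energy and a modified short-time zero-crossing rate (sign function with a
# noise dead zone), locates speech endpoints and unvoiced/voiced boundaries
# with simple thresholds and a small state machine, and plots the results.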
from typing import Optional

import scipy.io.wavfile as wav
import numpy as np
import matplotlib.pyplot as plt
import ipdb


def hamming(frame_length: int) -> np.ndarray:
    # frame_length - window length in samples
    n = np.arange(frame_length)
    # Standard Hamming window coefficients (0.54, 0.46)
    h = 0.54 - 0.46 * np.cos(2 * np.pi * n / (frame_length - 1))
    return h


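# delta_sgn: three-level sign function with a dead zone. Samples whose
# magnitude stays below max|x|/20 are mapped to 0, so small fluctuations
# around zero (background noise) do not later register as zero crossings.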
def delta_sgn(x: np.ndarray) -> np.ndarray:
    # x - speech signal
    sound = x
    threshold = np.max(np.abs(sound)) / 20
    negative_sound = sound + threshold
    negative_sound -= np.abs(negative_sound)  # zero above -threshold, negative below
    positive_sound = sound - threshold
    positive_sound += np.abs(positive_sound)  # zero below +threshold, positive above
    sound = negative_sound + positive_sound
    return np.sign(sound)


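# ampf: short-time energy. The signal is split into overlapping frames of
# FrameLen samples taken every inc samples; each frame is weighted by the
# squared Hamming window, and the squared samples are summed and divided by
# FrameLen.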
def ampf(
    x: np.ndarray, FrameLen: Optional[int] = 128, inc: Optional[int] = 90
) -> np.ndarray:
    # x - time-domain speech signal
    # FrameLen - length of each frame
    # inc - frame shift (step size)

    frames = []
    for i in range(0, len(x) - FrameLen, inc):
        frame = x[i : i + FrameLen]
        frames.append(frame)
    frames = np.array(frames)

    h = hamming(frame_length=FrameLen)  # Hamming window
    amp = np.dot(frames**2, h.T**2).T / FrameLen

    return amp


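# zcrf: short-time zero-crossing rate. A sign change between neighbouring
# samples gives |sgn(x[n]) - sgn(x[n-1])| = 2, so summing the absolute sign
# differences over a frame and dividing by 2*FrameLen yields the average
# crossing rate per sample.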
def zcrf(
    x: np.ndarray, FrameLen: Optional[int] = 128, inc: Optional[int] = 90
) -> np.ndarray:
    # x - time-domain speech signal
    # FrameLen - length of each frame
    # inc - frame shift (step size)

    sound = x
    sgn_sound = np.sign(sound)

    dif_sound = np.abs(sgn_sound[1:] - sgn_sound[:-1])
    h = np.ones((FrameLen,)) / (2 * FrameLen)

    frames = []
    for i in range(0, len(dif_sound) - FrameLen, inc):
        frame = dif_sound[i : i + FrameLen]
        frames.append(frame)

    frames = np.array(frames)
    zcr = np.dot(frames, h.T).T
    return zcr


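# zcrf_delta: identical to zcrf except that the plain sign function is
# replaced by delta_sgn, so low-level fluctuations inside the dead zone are
# not counted as zero crossings. analyze_sound uses this variant.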
def zcrf_delta(
    x: np.ndarray, FrameLen: Optional[int] = 128, inc: Optional[int] = 90
) -> np.ndarray:
    # x - time-domain speech signal
    # FrameLen - length of each frame
    # inc - frame shift (step size)

    sound = x
    sgn_sound = delta_sgn(sound)

    dif_sound = np.abs(sgn_sound[1:] - sgn_sound[:-1])
    h = np.ones((FrameLen,)) / (2 * FrameLen)

    frames = []
    for i in range(0, len(dif_sound) - FrameLen, inc):
        frame = dif_sound[i : i + FrameLen]
        frames.append(frame)

    frames = np.array(frames)
    zcr = np.dot(frames, h.T).T
    return zcr


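# analyze_sound: full pipeline. It reads the wav file, keeps one channel,
# normalizes, computes the short-time energy (ampf) and the modified ZCR
# (zcrf_delta), and derives a threshold for each feature from its dynamic
# range. A three-state scan over the frames then marks a speech onset (x1)
# when the ZCR stays above its threshold, an unvoiced-to-voiced boundary (x3)
# when the energy also rises above its threshold, and the end of speech (x2)
# when both features fall back below their thresholds. The boundaries are
# drawn as red (x1), blue (x2), and green (x3) dashed lines on all three plots.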
def analyze_sound(
    filename: str, FrameLen: Optional[int] = 128, inc: Optional[int] = 90
) -> None:
    sr, sound_array = wav.read(filename)
    # Keep only the first channel if the file is not mono
    sound_array = sound_array.T[0, :] if sound_array.ndim != 1 else sound_array
    sound_array = sound_array / np.max(np.abs(sound_array))  # normalize

    amp = ampf(sound_array, FrameLen, inc)
    zcr = zcrf_delta(sound_array, FrameLen, inc)

    # Convert frame indices back to sample positions for plotting
    rescale_rate = len(sound_array) / amp.shape[0]
    frameTime = np.arange(len(amp)) * rescale_rate

    # Boundary detection
    x1 = []  # speech onsets
    x2 = []  # speech ends
    x3 = []  # unvoiced/voiced boundaries
    amp2 = np.min(amp) + (np.max(amp) - np.min(amp)) / 20  # energy threshold
    zcr2 = np.min(zcr) + (np.max(zcr) - np.min(zcr)) / 18  # ZCR threshold

    threshold_len = 6  # consecutive frames required to change state
    state = 1
    for i in range(threshold_len, len(amp) - threshold_len):
        if state == 1:
            if np.all(zcr[i : i + threshold_len] > zcr2):
                x1.append(i * rescale_rate)
                state = 2
        elif state == 2:
            if np.all(amp[i : i + threshold_len] > amp2):
                x3.append(i * rescale_rate)
                state = 3
        if (
            state != 1
            and np.all(amp[i : i + threshold_len] < amp2)
            and np.all(zcr[i : i + threshold_len] < zcr2)
        ):
            x2.append(i * rescale_rate)
            state = 1

    # Plot the speech waveform, short-time energy, and short-time zero-crossing rate
    plt.figure(figsize=(12, 8))

    # Speech waveform
    plt.subplot(3, 1, 1)
    plt.plot(sound_array)
    plt.title("Waveform")
    for boundary in x1:
        plt.axvline(x=boundary, color="r", linestyle="--", linewidth=0.5)
    for boundary in x2:
        plt.axvline(x=boundary, color="b", linestyle="--", linewidth=0.5)
    for boundary in x3:
        plt.axvline(x=boundary, color="g", linestyle="--", linewidth=0.5)

    # Short-time energy
    plt.subplot(3, 1, 2)
    plt.plot(frameTime, amp, label="Energy")
    plt.axhline(y=amp2, color="r", linestyle="--", label="Energy Threshold")
    plt.legend()
    plt.title("Short-time Energy")
    for boundary in x1:
        plt.axvline(x=boundary, color="r", linestyle="--", linewidth=0.5)
    for boundary in x2:
        plt.axvline(x=boundary, color="b", linestyle="--", linewidth=0.5)
    for boundary in x3:
        plt.axvline(x=boundary, color="g", linestyle="--", linewidth=0.5)

    # Short-time zero-crossing rate
    plt.subplot(3, 1, 3)
    plt.plot(frameTime, zcr, label="Zero Crossing Rate")
    plt.axhline(y=zcr2, color="r", linestyle="--", label="ZCR Threshold")
    plt.legend()
    plt.title("Short-time Zero Crossing Rate")

    # Mark the speech endpoints and unvoiced/voiced boundaries
    for boundary in x1:
        plt.axvline(x=boundary, color="r", linestyle="--", linewidth=0.5)
    for boundary in x2:
        plt.axvline(x=boundary, color="b", linestyle="--", linewidth=0.5)
    for boundary in x3:
        plt.axvline(x=boundary, color="g", linestyle="--", linewidth=0.5)

    plt.tight_layout()
    plt.show()


if __name__ == "__main__":
    analyze_sound("tang1.wav", FrameLen=128, inc=90)
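# Note: voice.wav is also added in this commit; it can be analyzed the same
# way, e.g. analyze_sound("voice.wav", FrameLen=128, inc=90).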
BIN
Lab/Lab2/code/voice.wav
Normal file
Binary file not shown.