Skip to content

Commit

Permalink
Update comments
Browse files Browse the repository at this point in the history
  • Loading branch information
mochi-neko committed Jul 5, 2023
1 parent 797dd20 commit bf26565
Show file tree
Hide file tree
Showing 15 changed files with 167 additions and 57 deletions.
13 changes: 7 additions & 6 deletions Assets/Mochineko/VoiceActivityDetection.Samples/VADSample.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ namespace Mochineko.VoiceActivityDetection.Samples
{
/// <summary>
/// A sample of voice activity detection as a component.
/// Takes input from UnityEngine.Microphone and only outputs logs.
/// </summary>
internal sealed class VADSample : MonoBehaviour
{
Expand All @@ -20,16 +21,16 @@ internal sealed class VADSample : MonoBehaviour
private float activationRateThreshold = 0.6f;

[SerializeField]
private float deactivationRateThreshold = 0.4f;
private float inactivationRateThreshold = 0.4f;

[SerializeField]
private float activationIntervalSeconds = 0.5f;

[SerializeField]
private float deactivationIntervalSeconds = 0.5f;
private float inactivationIntervalSeconds = 0.5f;

[SerializeField]
private float maxDurationSeconds = 10f;
private float maxActiveDurationSeconds = 10f;

private IVoiceActivityDetector? vad;

Expand All @@ -41,10 +42,10 @@ private void Start()
maxQueueingTimeSeconds,
activeVolumeThreshold,
activationRateThreshold,
deactivationRateThreshold,
inactivationRateThreshold,
activationIntervalSeconds,
deactivationIntervalSeconds,
maxDurationSeconds);
inactivationIntervalSeconds,
maxActiveDurationSeconds);

vad
.IsActive
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,15 @@
using Cysharp.Threading.Tasks;
using Mochineko.Relent.Resilience;
using Mochineko.Relent.UncertainResult;
using UniRx;
using Unity.Logging;
using UnityEngine;

namespace Mochineko.VoiceActivityDetection.Samples
{
/// <summary>
/// A sample of voice activity detection as a component.
/// Takes input from UnityEngine.Microphone, outputs a WAV file, then transcribes the voice into text with the OpenAI/Whisper API.
/// </summary>
internal sealed class VADToWhisperSample : MonoBehaviour, IWaveStreamReceiver
{
[SerializeField]
Expand All @@ -26,16 +29,16 @@ internal sealed class VADToWhisperSample : MonoBehaviour, IWaveStreamReceiver
private float activationRateThreshold = 0.6f;

[SerializeField]
private float deactivationRateThreshold = 0.4f;
private float inactivationRateThreshold = 0.4f;

[SerializeField]
private float activationIntervalSeconds = 0.5f;

[SerializeField]
private float deactivationIntervalSeconds = 0.5f;
private float inactivationIntervalSeconds = 0.5f;

[SerializeField]
private float maxDurationSeconds = 10f;
private float maxActiveDurationSeconds = 10f;

private IVoiceActivityDetector? vad;

Expand All @@ -58,10 +61,10 @@ private void Start()
maxQueueingTimeSeconds,
activeVolumeThreshold,
activationRateThreshold,
deactivationRateThreshold,
inactivationRateThreshold,
activationIntervalSeconds,
deactivationIntervalSeconds,
maxDurationSeconds);
inactivationIntervalSeconds,
maxActiveDurationSeconds);
}

private void OnDestroy()
Expand All @@ -82,7 +85,7 @@ private void Update()
}
}

public void OnReceive(Stream stream)
void IWaveStreamReceiver.OnReceive(Stream stream)
{
Log.Debug("[VAD.Samples] Enqueue wave stream.");

Expand All @@ -91,6 +94,7 @@ public void OnReceive(Stream stream)

private async UniTask TranscribeAsync(Stream stream, CancellationToken cancellationToken)
{
// The API key must be set in the OPENAI_API_KEY environment variable.
var apiKey = Environment.GetEnvironmentVariable("OPENAI_API_KEY");
if (string.IsNullOrEmpty(apiKey))
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
namespace Mochineko.VoiceActivityDetection
{
/// <summary>
/// Detects voice activity.
/// Detector of voice activity.
/// </summary>
public interface IVoiceActivityDetector : IDisposable
{
Expand Down
16 changes: 13 additions & 3 deletions Assets/Mochineko/VoiceActivityDetection/IVoiceBuffer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -6,20 +6,30 @@
namespace Mochineko.VoiceActivityDetection
{
/// <summary>
/// Buffers voice data.
/// Buffer of voice data.
/// </summary>
public interface IVoiceBuffer : IDisposable
{
/// <summary>
/// Buffers voice segment.
/// </summary>
/// <param name="segment"></param>
/// <param name="cancellationToken"></param>
/// <param name="segment">Voice segment data to buffer.</param>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns></returns>
UniTask BufferAsync(VoiceSegment segment, CancellationToken cancellationToken);

/// <summary>
/// Called when voice has been active.
/// </summary>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns></returns>
UniTask OnActiveAsync(CancellationToken cancellationToken);

/// <summary>
/// Called when voice has been inactive.
/// </summary>
/// <param name="cancellationToken">Cancellation token.</param>
/// <returns></returns>
UniTask OnInactiveAsync(CancellationToken cancellationToken);
}
}
8 changes: 7 additions & 1 deletion Assets/Mochineko/VoiceActivityDetection/IVoiceSource.cs
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,18 @@ namespace Mochineko.VoiceActivityDetection
/// </summary>
public interface IVoiceSource : IDisposable
{
/// <summary>
/// Sampling rate (= frequency) of voice data.
/// </summary>
int SamplingRate { get; }

/// <summary>
/// Channel count of voice data.
/// </summary>
int Channels { get; }

/// <summary>
/// Called when a segment is read.
/// Called when a segment has been read.
/// </summary>
IObservable<VoiceSegment> OnSegmentRead { get; }

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,16 @@

namespace Mochineko.VoiceActivityDetection
{
/// <summary>
/// Receiver of a wave stream.
/// </summary>
public interface IWaveStreamReceiver
{
/// <summary>
/// Receives a wave stream.
/// Note that the stream instance should be disposed by the receiver.
/// </summary>
/// <param name="stream">Received stream.</param>
void OnReceive(Stream stream);
}
}
15 changes: 8 additions & 7 deletions Assets/Mochineko/VoiceActivityDetection/NullVoiceBuffer.cs
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#nullable enable
using System;
using System.Threading;
using Cysharp.Threading.Tasks;

Expand All @@ -9,17 +10,17 @@ namespace Mochineko.VoiceActivityDetection
/// </summary>
public sealed class NullVoiceBuffer : IVoiceBuffer
{
public void Dispose()
{
}

public UniTask BufferAsync(VoiceSegment segment, CancellationToken cancellationToken)
UniTask IVoiceBuffer.BufferAsync(VoiceSegment segment, CancellationToken cancellationToken)
=> UniTask.CompletedTask;

public UniTask OnActiveAsync(CancellationToken cancellationToken)
UniTask IVoiceBuffer.OnActiveAsync(CancellationToken cancellationToken)
=> UniTask.CompletedTask;

public UniTask OnInactiveAsync(CancellationToken cancellationToken)
UniTask IVoiceBuffer.OnInactiveAsync(CancellationToken cancellationToken)
=> UniTask.CompletedTask;

void IDisposable.Dispose()
{
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,12 @@

namespace Mochineko.VoiceActivityDetection
{
/// <summary>
/// Null object of <see cref="IWaveStreamReceiver"/> that only disposes the received stream.
/// </summary>
public sealed class NullWaveStreamReceiver : IWaveStreamReceiver
{
public void OnReceive(Stream stream)
void IWaveStreamReceiver.OnReceive(Stream stream)
{
stream.Dispose();
}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#nullable enable
using System;
using System.Diagnostics;
using System.Threading;
using UniRx;
Expand All @@ -8,7 +9,7 @@ namespace Mochineko.VoiceActivityDetection
{
/// <summary>
/// A simple implementation of <see cref="IVoiceActivityDetector"/>.
/// Detects voice activity by using voice segment queue, volume threshold, activation/deactivation rate and deactivation interval.
/// Detects voice activity by using a voice segment queue, volume threshold, activation/inactivation rate and interval.
/// </summary>
public sealed class QueueingVoiceActivityDetector : IVoiceActivityDetector
{
Expand All @@ -17,39 +18,51 @@ public sealed class QueueingVoiceActivityDetector : IVoiceActivityDetector
private readonly VoiceSegmentActivityQueue queue;
private readonly float activeVolumeThreshold;
private readonly float activationRateThreshold;
private readonly float deactivationRateThreshold;
private readonly float inactivationRateThreshold;
private readonly float activationIntervalSeconds;
private readonly float deactivationIntervalSeconds;
private readonly float maxDurationSeconds;
private readonly float inactivationIntervalSeconds;
private readonly float maxActiveDurationSeconds;

private readonly CompositeDisposable compositeDisposable = new();
private readonly CancellationTokenSource cancellationTokenSource = new();
private readonly Stopwatch intervalStopwatch = new();
private readonly Stopwatch totalDurationStopwatch = new();

private readonly ReactiveProperty<bool> isActive = new();
public IReadOnlyReactiveProperty<bool> IsActive => isActive;
IReadOnlyReactiveProperty<bool> IVoiceActivityDetector.IsActive => isActive;

/// <summary>
/// Creates a new instance of <see cref="QueueingVoiceActivityDetector"/>.
/// </summary>
/// <param name="source">Source of voice data.</param>
/// <param name="buffer">Buffer of voice data.</param>
/// <param name="maxQueueingTimeSeconds">Max time(sec) to queue voice segment.</param>
/// <param name="activeVolumeThreshold">Threshold of active voice volume by root mean square.</param>
/// <param name="activationRateThreshold">Threshold of active rate in queue that changes into active state.</param>
/// <param name="inactivationRateThreshold">Threshold of active rate in queue that changes into inactive state.</param>
/// <param name="activationIntervalSeconds">Interval time(sec) to change from inactive state to active state.</param>
/// <param name="inactivationIntervalSeconds">Interval time(sec) to change from active state to inactive state.</param>
/// <param name="maxActiveDurationSeconds">Max time(sec) of active state.</param>
public QueueingVoiceActivityDetector(
IVoiceSource source,
IVoiceBuffer buffer,
float maxQueueingTimeSeconds,
float activeVolumeThreshold,
float activationRateThreshold,
float deactivationRateThreshold,
float inactivationRateThreshold,
float activationIntervalSeconds,
float deactivationIntervalSeconds,
float maxDurationSeconds)
float inactivationIntervalSeconds,
float maxActiveDurationSeconds)
{
this.source = source;
this.buffer = buffer;
this.queue = new VoiceSegmentActivityQueue(maxQueueingTimeSeconds);
this.activeVolumeThreshold = activeVolumeThreshold;
this.activationRateThreshold = activationRateThreshold;
this.deactivationRateThreshold = deactivationRateThreshold;
this.inactivationRateThreshold = inactivationRateThreshold;
this.activationIntervalSeconds = activationIntervalSeconds;
this.deactivationIntervalSeconds = deactivationIntervalSeconds;
this.maxDurationSeconds = maxDurationSeconds;
this.inactivationIntervalSeconds = inactivationIntervalSeconds;
this.maxActiveDurationSeconds = maxActiveDurationSeconds;

this.source
.OnSegmentRead
Expand All @@ -59,7 +72,7 @@ public QueueingVoiceActivityDetector(
this.intervalStopwatch.Start();
}

public void Dispose()
void IDisposable.Dispose()
{
this.cancellationTokenSource.Dispose();
this.compositeDisposable.Dispose();
Expand All @@ -69,7 +82,7 @@ public void Dispose()
this.totalDurationStopwatch.Stop();
}

public void Update()
void IVoiceActivityDetector.Update()
{
this.source.Update();
}
Expand Down Expand Up @@ -104,11 +117,11 @@ private async void OnSegmentReadAsync(VoiceSegment segment)
}
else if (
isActive.Value
&& (totalDurationStopwatch.ElapsedMilliseconds >= maxDurationSeconds * 1000
|| (activeRate <= deactivationRateThreshold
&& intervalStopwatch.ElapsedMilliseconds >= deactivationIntervalSeconds * 1000)))
&& (totalDurationStopwatch.ElapsedMilliseconds >= maxActiveDurationSeconds * 1000
|| (activeRate <= inactivationRateThreshold
&& intervalStopwatch.ElapsedMilliseconds >= inactivationIntervalSeconds * 1000)))
{
Log.Debug("[VAD] Deactivated.");
Log.Debug("[VAD] Inactivated.");
await this.buffer.OnInactiveAsync(this.cancellationTokenSource.Token);
this.isActive.Value = false;
intervalStopwatch.Restart();
Expand Down
17 changes: 17 additions & 0 deletions Assets/Mochineko/VoiceActivityDetection/UnityMicrophoneProxy.cs
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,18 @@ namespace Mochineko.VoiceActivityDetection
public sealed class UnityMicrophoneProxy : IDisposable
{
private readonly string? deviceName;

/// <summary>
/// AudioClip instance of microphone recording.
/// </summary>
public AudioClip AudioClip { get; }

/// <summary>
/// Creates a new instance of <see cref="UnityMicrophoneProxy"/>.
/// </summary>
/// <param name="deviceName">Microphone device name to record, `null` specifies OS default device.</param>
/// <param name="loopLengthSeconds">Loop time(sec) of AudioClip.</param>
/// <param name="frequency">Frequency (= sampling rate) of recording.</param>
public UnityMicrophoneProxy(
string? deviceName = null,
int loopLengthSeconds = 1,
Expand All @@ -22,12 +32,19 @@ public UnityMicrophoneProxy(
this.AudioClip = Microphone.Start(this.deviceName, loop: true, loopLengthSeconds, frequency);
}

/// <summary>
/// Disposes this instance.
/// </summary>
public void Dispose()
{
Microphone.End(this.deviceName);
UnityEngine.Object.Destroy(this.AudioClip);
}

/// <summary>
/// Gets the current sample position of the microphone recording in the looped AudioClip.
/// </summary>
/// <returns></returns>
public int GetSamplePosition()
=> Microphone.GetPosition(this.deviceName);
}
Expand Down
Loading

0 comments on commit bf26565

Please sign in to comment.