Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed tokenizer and audio processing logic #214

Open
wants to merge 27 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
27 commits
Select commit Hold shift + click to select a range
c53d130
Update dependencies and fix language detection typo
1amageek Oct 3, 2024
2608340
Update AudioEncoder shape access and add tokenizer methods
1amageek Oct 3, 2024
b8029fb
Add `Sendable` conformance to several structs and enums
1amageek Oct 4, 2024
2af7d50
Refactor AudioProcessor to use actor model and async
1amageek Oct 5, 2024
dd1c4d5
Add Sendable conformance to various types and protocols
1amageek Oct 5, 2024
208893d
Update development team and package dependencies
1amageek Oct 5, 2024
9539e8b
Update package version and clean up code formatting
1amageek Oct 5, 2024
769dc29
Refactor audio energy calculations and buffer conversion
1amageek Oct 5, 2024
c8219d3
Refactor calculateRelativeEnergy method for clarity
1amageek Oct 5, 2024
2c0549c
Optimize audio buffer processing with vDSP_mmov
1amageek Oct 5, 2024
22aaa70
Refactor audio sample access methods in AudioProcessor
1amageek Oct 5, 2024
8ceaa0a
Remove unnecessary weak self references in closure
1amageek Oct 5, 2024
4d4233e
Refactor audio processing to use async/await methods
1amageek Oct 5, 2024
f9bcd1d
Use weak self in audio tap closure to prevent retain cycle
1amageek Oct 5, 2024
ea5d853
Log file name in error message for transcriber
1amageek Oct 5, 2024
5909d11
Refactor VADAudioChunker to a struct from a class
1amageek Oct 5, 2024
184b990
Refactor voice activity detection to use protocols
1amageek Oct 5, 2024
933b71b
Add audio converter initialization in resampling process
1amageek Oct 5, 2024
2dbb87f
Refactor AudioProcessor to use SampleRange type
1amageek Oct 6, 2024
368333f
Make AudioProcessing conform to Actor protocol
1amageek Oct 6, 2024
c41fb22
Refactor SegmentSeeker to improve readability and performance
1amageek Oct 6, 2024
621b1f3
Refactor SegmentSeeker to improve clarity and efficiency
1amageek Oct 6, 2024
f646268
Refactor SegmentSeeker to simplify alignment handling
1amageek Oct 6, 2024
db66166
Refactor SegmentSeeker to handle Float16 data type
1amageek Oct 6, 2024
28f34c3
Remove unnecessary comments in SegmentSeeker.swift
1amageek Oct 6, 2024
bb66ae1
Refactor SegmentSeeker for improved clarity and performance
1amageek Oct 6, 2024
8bfdd88
Refactor audio processor deinit and improve memory management
1amageek Oct 23, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Examples/WhisperAX/WhisperAX.xcodeproj/project.pbxproj
Original file line number Diff line number Diff line change
Expand Up @@ -869,7 +869,7 @@
CURRENT_PROJECT_VERSION = 1;
DEAD_CODE_STRIPPING = YES;
DEVELOPMENT_ASSET_PATHS = "\"WhisperAX/Preview Content\"";
DEVELOPMENT_TEAM = PP83DTRKSA;
DEVELOPMENT_TEAM = 88ACA86N96;
ENABLE_HARDENED_RUNTIME = YES;
ENABLE_PREVIEWS = YES;
GENERATE_INFOPLIST_FILE = YES;
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,15 @@
{
"originHash" : "cd17206b47bb810af9459722192530e3838d8e6629a970988e32a432aaa05f6e",
"originHash" : "420a1723357da21f9e31b01403fd3d66df6e400a752d242d05b2c3d5667e3c33",
"pins" : [
{
"identity" : "jinja",
"kind" : "remoteSourceControl",
"location" : "https://github.com/maiqingqiang/Jinja",
"state" : {
"revision" : "b435eb62b0d3d5f34167ec70a128355486981712",
"version" : "1.0.5"
}
},
{
"identity" : "networkimage",
"kind" : "remoteSourceControl",
Expand All @@ -15,26 +24,26 @@
"kind" : "remoteSourceControl",
"location" : "https://github.com/apple/swift-argument-parser.git",
"state" : {
"revision" : "c8ed701b513cf5177118a175d85fbbbcd707ab41",
"version" : "1.3.0"
"revision" : "41982a3656a71c768319979febd796c6fd111d5c",
"version" : "1.5.0"
}
},
{
"identity" : "swift-markdown-ui",
"kind" : "remoteSourceControl",
"location" : "https://github.com/gonzalezreal/swift-markdown-ui.git",
"state" : {
"revision" : "ae799d015a5374708f7b4c85f3294c05f2a564e2",
"version" : "2.3.0"
"revision" : "55441810c0f678c78ed7e2ebd46dde89228e02fc",
"version" : "2.4.0"
}
},
{
"identity" : "swift-transformers",
"kind" : "remoteSourceControl",
"location" : "https://github.com/huggingface/swift-transformers.git",
"state" : {
"revision" : "74b94211bdc741694ed7e700a1104c72e5ba68fe",
"version" : "0.1.7"
"revision" : "0f2306713d48a75b862026ebb291926793773f52",
"version" : "0.1.12"
}
}
],
Expand Down
12 changes: 7 additions & 5 deletions Examples/WhisperAX/WhisperAX/Views/ContentView.swift
Original file line number Diff line number Diff line change
Expand Up @@ -1206,9 +1206,10 @@ struct ContentView: View {
#endif

try? audioProcessor.startRecordingLive(inputDeviceID: deviceId) { _ in
DispatchQueue.main.async {
bufferEnergy = whisperKit?.audioProcessor.relativeEnergy ?? []
bufferSeconds = Double(whisperKit?.audioProcessor.audioSamples.count ?? 0) / Double(WhisperKit.sampleRate)
Task { @MainActor in
bufferEnergy = await whisperKit?.audioProcessor.getRelativeEnergy() ?? []
let audioSamples = await whisperKit?.audioProcessor.getAudioSamples() ?? []
bufferSeconds = Double(audioSamples.count) / Double(WhisperKit.sampleRate)
}
}

Expand Down Expand Up @@ -1406,7 +1407,7 @@ struct ContentView: View {
guard let whisperKit = whisperKit else { return }

// Retrieve the current audio buffer from the audio processor
let currentBuffer = whisperKit.audioProcessor.audioSamples
let currentBuffer = whisperKit.audioProcessor.getAudioSamples()

// Calculate the size and duration of the next buffer segment
let nextBufferSize = currentBuffer.count - lastBufferSize
Expand All @@ -1424,8 +1425,9 @@ struct ContentView: View {
}

if useVAD {
let relativeEnergy = whisperKit.audioProcessor.getRelativeEnergy()
let voiceDetected = AudioProcessor.isVoiceDetected(
in: whisperKit.audioProcessor.relativeEnergy,
in: relativeEnergy,
nextBufferInSeconds: nextBufferSeconds,
silenceThreshold: Float(silenceThreshold)
)
Expand Down
17 changes: 13 additions & 4 deletions Package.resolved
Original file line number Diff line number Diff line change
@@ -1,21 +1,30 @@
{
"pins" : [
{
"identity" : "jinja",
"kind" : "remoteSourceControl",
"location" : "https://github.com/maiqingqiang/Jinja",
"state" : {
"revision" : "4ffa95ce02e013c992287e19e3bbd620b6cc233a",
"version" : "1.0.4"
}
},
{
"identity" : "swift-argument-parser",
"kind" : "remoteSourceControl",
"location" : "https://github.com/apple/swift-argument-parser.git",
"state" : {
"revision" : "c8ed701b513cf5177118a175d85fbbbcd707ab41",
"version" : "1.3.0"
"revision" : "41982a3656a71c768319979febd796c6fd111d5c",
"version" : "1.5.0"
}
},
{
"identity" : "swift-transformers",
"kind" : "remoteSourceControl",
"location" : "https://github.com/huggingface/swift-transformers.git",
"state" : {
"revision" : "74b94211bdc741694ed7e700a1104c72e5ba68fe",
"version" : "0.1.7"
"revision" : "0f2306713d48a75b862026ebb291926793773f52",
"version" : "0.1.12"
}
}
],
Expand Down
4 changes: 2 additions & 2 deletions Package.swift
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@ let package = Package(
),
],
dependencies: [
.package(url: "https://github.com/huggingface/swift-transformers.git", exact: "0.1.7"),
.package(url: "https://github.com/apple/swift-argument-parser.git", exact: "1.3.0"),
.package(url: "https://github.com/huggingface/swift-transformers.git", exact: "0.1.12"),
.package(url: "https://github.com/apple/swift-argument-parser.git", exact: "1.5.0"),
],
targets: [
.target(
Expand Down
15 changes: 9 additions & 6 deletions Sources/WhisperKit/Core/Audio/AudioChunker.swift
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,12 @@ public extension AudioChunking {

/// An audio chunker that splits audio into smaller pieces based on voice activity detection
@available(macOS 13, iOS 16, watchOS 10, visionOS 1, *)
open class VADAudioChunker: AudioChunking {
public struct VADAudioChunker: AudioChunking {
/// prevent hallucinations at the end of the clip by stopping up to 1.0s early
private let windowPadding: Int
private let vad: VoiceActivityDetector
private let vad: any VoiceActivityDetectable

public init(windowPadding: Int = 16000, vad: VoiceActivityDetector? = nil) {
public init(windowPadding: Int = 16000, vad: (any VoiceActivityDetectable)? = nil) {
self.windowPadding = windowPadding
self.vad = vad ?? EnergyVAD()
}
Expand Down Expand Up @@ -81,12 +81,12 @@ open class VADAudioChunker: AudioChunking {
// Typically this will be the full audio file, unless seek points are explicitly provided
var startIndex = seekClipStart
while startIndex < seekClipEnd - windowPadding {
let currentFrameLength = startIndex - seekClipStart
if startIndex >= currentFrameLength, startIndex < 0 {
// Check that startIndex is within the bounds of the audio array
if startIndex >= audioArray.count || startIndex < 0 {
throw WhisperError.audioProcessingFailed("startIndex is outside the buffer size")
}

// Make sure we still need chunking for this seek clip, otherwise use the original seek clip end
// Adjust the end index based on VAD or maxChunkLength
var endIndex = seekClipEnd
if startIndex + maxChunkLength < endIndex {
// Adjust the end index based on VAD
Expand All @@ -97,6 +97,8 @@ open class VADAudioChunker: AudioChunking {
)
}

// Ensure endIndex is within the array bounds
endIndex = min(endIndex, audioArray.count)
guard endIndex > startIndex else {
break
}
Expand All @@ -108,4 +110,5 @@ open class VADAudioChunker: AudioChunking {
}
return chunkedAudio
}

}
Loading