From 4e31607c7d779539ff54cd646e49f7ea36e78ad3 Mon Sep 17 00:00:00 2001 From: Nick Brandt Date: Thu, 3 Oct 2024 12:21:19 -0400 Subject: [PATCH 1/8] Detect UTF-8 for vtt/srt, error if encoded otherwise --- srt.go | 5 +++++ webvtt.go | 5 +++++ 2 files changed, 10 insertions(+) diff --git a/srt.go b/srt.go index e7f016c..ae66379 100644 --- a/srt.go +++ b/srt.go @@ -7,6 +7,7 @@ import ( "strconv" "strings" "time" + "unicode/utf8" ) // Constants @@ -42,6 +43,10 @@ func ReadFromSRT(i io.Reader) (o *Subtitles, err error) { for scanner.Scan() { // Fetch line line = strings.TrimSpace(scanner.Text()) + if !utf8.ValidString(line) { + err = fmt.Errorf("astisub: bytes are not valid utf-8") + return + } lineNum++ // Remove BOM header diff --git a/webvtt.go b/webvtt.go index ec0e7e3..5342ab1 100644 --- a/webvtt.go +++ b/webvtt.go @@ -11,6 +11,7 @@ import ( "strconv" "strings" "time" + "unicode/utf8" "golang.org/x/net/html" ) @@ -127,6 +128,10 @@ func ReadFromWebVTT(i io.Reader) (o *Subtitles, err error) { for scanner.Scan() { lineNum++ line = scanner.Text() + if !utf8.ValidString(line) { + err = fmt.Errorf("astisub: bytes are not valid utf-8") + return + } line = strings.TrimPrefix(line, string(BytesBOM)) if fs := strings.Fields(line); len(fs) > 0 && fs[0] == "WEBVTT" { break From 1fdf7b65ce337d316c25899af10405562edfa7ac Mon Sep 17 00:00:00 2001 From: Nick Brandt Date: Fri, 4 Oct 2024 11:28:09 -0400 Subject: [PATCH 2/8] Switch to isValidUTF8Reader --- srt.go | 9 ++++----- subtitles.go | 10 ++++++++++ subtitles_internal_test.go | 11 +++++++++++ webvtt.go | 9 ++++----- 4 files changed, 29 insertions(+), 10 deletions(-) diff --git a/srt.go b/srt.go index ae66379..e23f23b 100644 --- a/srt.go +++ b/srt.go @@ -7,7 +7,6 @@ import ( "strconv" "strings" "time" - "unicode/utf8" ) // Constants @@ -34,6 +33,10 @@ func parseDurationSRT(i string) (d time.Duration, err error) { func ReadFromSRT(i io.Reader) (o *Subtitles, err error) { // Init o = NewSubtitles() + if !isValidUTF8Reader(i) { + err = fmt.Errorf("astisub: bytes are not valid utf-8") + return + } var scanner = bufio.NewScanner(i) // Scan @@ -43,10 +46,6 @@ func ReadFromSRT(i io.Reader) (o *Subtitles, err error) { for scanner.Scan() { // Fetch line line = strings.TrimSpace(scanner.Text()) - if !utf8.ValidString(line) { - err = fmt.Errorf("astisub: bytes are not valid utf-8") - return - } lineNum++ // Remove BOM header diff --git a/subtitles.go b/subtitles.go index e917931..dbaed77 100644 --- a/subtitles.go +++ b/subtitles.go @@ -1,8 +1,10 @@ package astisub import ( + "bufio" "errors" "fmt" + "io" "math" "os" "path/filepath" @@ -10,6 +12,7 @@ import ( "strconv" "strings" "time" + "unicode/utf8" "github.com/asticode/go-astikit" ) @@ -835,3 +838,10 @@ func appendStringToBytesWithNewLine(i []byte, s string) (o []byte) { o = append(o, bytesLineSeparator...) return } + +// isValidUTF8Reader will peek the first 512 bytes to determine whether or not +// they are valid UTF-8, returning true if valid. +func isValidUTF8Reader(i io.Reader) bool { + testBytes, _ := bufio.NewReader(i).Peek(512) + return utf8.Valid(testBytes) +} diff --git a/subtitles_internal_test.go b/subtitles_internal_test.go index df4d2f7..9ba5000 100644 --- a/subtitles_internal_test.go +++ b/subtitles_internal_test.go @@ -1,6 +1,7 @@ package astisub import ( + "bytes" "testing" "time" @@ -66,3 +67,13 @@ func TestFormatDuration(t *testing.T) { s = formatDuration(12*time.Hour+34*time.Minute+56*time.Second+999*time.Millisecond, ",", 2) assert.Equal(t, "12:34:56,99", s) } + +func TestIsValidUTF8Reader(t *testing.T) { + utf8Reader := bytes.NewBuffer([]byte("abc123")) + valid := isValidUTF8Reader(utf8Reader) + assert.True(t, valid) + + utf16Reader := bytes.NewBuffer([]byte("\xff\xfea\x00b\x00c\x001\x002\x003\x00")) + valid = isValidUTF8Reader(utf16Reader) + assert.True(t, !valid) +} diff --git a/webvtt.go b/webvtt.go index 5342ab1..561b0bb 100644 --- a/webvtt.go +++ b/webvtt.go @@ -11,7 +11,6 @@ import ( "strconv" "strings" "time" - "unicode/utf8" "golang.org/x/net/html" ) @@ -120,6 +119,10 @@ func parseWebVTTTimestampMap(line string) (timestampMap *WebVTTTimestampMap, err func ReadFromWebVTT(i io.Reader) (o *Subtitles, err error) { // Init o = NewSubtitles() + if !isValidUTF8Reader(i) { + err = fmt.Errorf("astisub: bytes are not valid utf-8") + return + } var scanner = bufio.NewScanner(i) var line string var lineNum int @@ -128,10 +131,6 @@ func ReadFromWebVTT(i io.Reader) (o *Subtitles, err error) { for scanner.Scan() { lineNum++ line = scanner.Text() - if !utf8.ValidString(line) { - err = fmt.Errorf("astisub: bytes are not valid utf-8") - return - } line = strings.TrimPrefix(line, string(BytesBOM)) if fs := strings.Fields(line); len(fs) > 0 && fs[0] == "WEBVTT" { break From 0e049189716d5495042a8a1c3f0200e87677572c Mon Sep 17 00:00:00 2001 From: Nick Brandt Date: Sun, 6 Oct 2024 22:08:25 -0400 Subject: [PATCH 3/8] Remove isValidUTF8Reader function --- subtitles.go | 10 ---------- subtitles_internal_test.go | 11 ----------- 2 files changed, 21 deletions(-) diff --git a/subtitles.go b/subtitles.go index dbaed77..e917931 100644 --- a/subtitles.go +++ b/subtitles.go @@ -1,10 +1,8 @@ package astisub import ( - "bufio" "errors" "fmt" - "io" "math" "os" "path/filepath" @@ -12,7 +10,6 @@ import ( "strconv" "strings" "time" - "unicode/utf8" "github.com/asticode/go-astikit" ) @@ -838,10 +835,3 @@ func appendStringToBytesWithNewLine(i []byte, s string) (o []byte) { o = append(o, bytesLineSeparator...) return } - -// isValidUTF8Reader will peek the first 512 bytes to determine whether or not -// they are valid UTF-8, returning true if valid. -func isValidUTF8Reader(i io.Reader) bool { - testBytes, _ := bufio.NewReader(i).Peek(512) - return utf8.Valid(testBytes) -} diff --git a/subtitles_internal_test.go b/subtitles_internal_test.go index 9ba5000..df4d2f7 100644 --- a/subtitles_internal_test.go +++ b/subtitles_internal_test.go @@ -1,7 +1,6 @@ package astisub import ( - "bytes" "testing" "time" @@ -67,13 +66,3 @@ func TestFormatDuration(t *testing.T) { s = formatDuration(12*time.Hour+34*time.Minute+56*time.Second+999*time.Millisecond, ",", 2) assert.Equal(t, "12:34:56,99", s) } - -func TestIsValidUTF8Reader(t *testing.T) { - utf8Reader := bytes.NewBuffer([]byte("abc123")) - valid := isValidUTF8Reader(utf8Reader) - assert.True(t, valid) - - utf16Reader := bytes.NewBuffer([]byte("\xff\xfea\x00b\x00c\x001\x002\x003\x00")) - valid = isValidUTF8Reader(utf16Reader) - assert.True(t, !valid) -} From 5ea7c5a344c619cd1bfb7c60918a690a1f1c0151 Mon Sep 17 00:00:00 2001 From: Nick Brandt Date: Sun, 6 Oct 2024 22:09:11 -0400 Subject: [PATCH 4/8] Enforce valid UTF8 on each SRT and WebVTT line --- srt.go | 9 +++++---- webvtt.go | 13 +++++++++---- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/srt.go b/srt.go index e23f23b..58f77ce 100644 --- a/srt.go +++ b/srt.go @@ -7,6 +7,7 @@ import ( "strconv" "strings" "time" + "unicode/utf8" ) // Constants @@ -33,10 +34,6 @@ func parseDurationSRT(i string) (d time.Duration, err error) { func ReadFromSRT(i io.Reader) (o *Subtitles, err error) { // Init o = NewSubtitles() - if !isValidUTF8Reader(i) { - err = fmt.Errorf("astisub: bytes are not valid utf-8") - return - } var scanner = bufio.NewScanner(i) // Scan @@ -47,6 +44,10 @@ func ReadFromSRT(i io.Reader) (o *Subtitles, err error) { // Fetch line line = strings.TrimSpace(scanner.Text()) lineNum++ + if !utf8.ValidString(line) { + err = fmt.Errorf("astisub: line %d is not valid utf-8", lineNum) + return + } // Remove BOM header if lineNum == 1 { diff --git a/webvtt.go b/webvtt.go index 561b0bb..89672fe 100644 --- a/webvtt.go +++ b/webvtt.go @@ -11,6 +11,7 @@ import ( "strconv" "strings" "time" + "unicode/utf8" "golang.org/x/net/html" ) @@ -119,10 +120,6 @@ func parseWebVTTTimestampMap(line string) (timestampMap *WebVTTTimestampMap, err func ReadFromWebVTT(i io.Reader) (o *Subtitles, err error) { // Init o = NewSubtitles() - if !isValidUTF8Reader(i) { - err = fmt.Errorf("astisub: bytes are not valid utf-8") - return - } var scanner = bufio.NewScanner(i) var line string var lineNum int @@ -132,6 +129,10 @@ func ReadFromWebVTT(i io.Reader) (o *Subtitles, err error) { lineNum++ line = scanner.Text() line = strings.TrimPrefix(line, string(BytesBOM)) + if !utf8.ValidString(line) { + err = fmt.Errorf("astisub: line %d is not valid utf-8", lineNum) + return + } if fs := strings.Fields(line); len(fs) > 0 && fs[0] == "WEBVTT" { break } @@ -148,6 +149,10 @@ func ReadFromWebVTT(i io.Reader) (o *Subtitles, err error) { // Fetch line line = strings.TrimSpace(scanner.Text()) lineNum++ + if !utf8.ValidString(line) { + err = fmt.Errorf("astisub: line %d is not valid utf-8", lineNum) + return + } switch { // Comment From f75c8db02d55960eadab7bce4eab4841e4dd8a33 Mon Sep 17 00:00:00 2001 From: Nick Brandt Date: Sun, 6 Oct 2024 22:09:22 -0400 Subject: [PATCH 5/8] Add benchmark tests for webvtt and srt --- srt_test.go | 6 ++++++ webvtt_test.go | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/srt_test.go b/srt_test.go index f44fbbd..29d81cc 100644 --- a/srt_test.go +++ b/srt_test.go @@ -46,3 +46,9 @@ func TestSRTMissingSequence(t *testing.T) { assert.NoError(t, err) assert.Equal(t, string(c), w.String()) } + +func BenchmarkOpenSRT(b *testing.B) { + for i := 0; i < b.N; i++ { + astisub.OpenFile("./testdata/example-in.srt") + } +} diff --git a/webvtt_test.go b/webvtt_test.go index 009d6b4..e0492a9 100644 --- a/webvtt_test.go +++ b/webvtt_test.go @@ -227,3 +227,9 @@ func TestWebVTTTags(t *testing.T) { Text with a <00:06:30.000>timestamp in the middle `, b.String()) } + +func BenchmarkOpenWebVTT(b *testing.B) { + for i := 0; i < b.N; i++ { + astisub.OpenFile("./testdata/example-in.vtt") + } +} From 9f1c78bb969ed920948f7cbb118b751cf3aa67dc Mon Sep 17 00:00:00 2001 From: Nick Brandt Date: Mon, 7 Oct 2024 10:14:27 -0400 Subject: [PATCH 6/8] Update benchmark test --- srt_test.go | 5 ++++- webvtt_test.go | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/srt_test.go b/srt_test.go index 29d81cc..910e6d6 100644 --- a/srt_test.go +++ b/srt_test.go @@ -3,6 +3,7 @@ package astisub_test import ( "bytes" "io/ioutil" + "os" "testing" "github.com/asticode/go-astisub" @@ -48,7 +49,9 @@ func TestSRTMissingSequence(t *testing.T) { } func BenchmarkOpenSRT(b *testing.B) { + f, _ := os.Open("./testdata/example-in.srt") + defer f.Close() for i := 0; i < b.N; i++ { - astisub.OpenFile("./testdata/example-in.srt") + astisub.ReadFromSRT(f) } } diff --git a/webvtt_test.go b/webvtt_test.go index e0492a9..9a271e3 100644 --- a/webvtt_test.go +++ b/webvtt_test.go @@ -3,6 +3,7 @@ package astisub_test import ( "bytes" "io/ioutil" + "os" "strings" "testing" "time" @@ -229,7 +230,9 @@ Text with a <00:06:30.000>timestamp in the middle } func BenchmarkOpenWebVTT(b *testing.B) { + f, _ := os.Open("./testdata/example-in.vtt") + defer f.Close() for i := 0; i < b.N; i++ { - astisub.OpenFile("./testdata/example-in.vtt") + astisub.ReadFromWebVTT(f) } } From aca49fabfa083483ea8011ec7a7f74f6fefc935b Mon Sep 17 00:00:00 2001 From: Nick Brandt Date: Mon, 7 Oct 2024 10:42:37 -0400 Subject: [PATCH 7/8] Add non-utf-8 tests and files --- srt_test.go | 5 +++++ testdata/example-in-non-utf8.srt | Bin 0 -> 734 bytes testdata/example-in-non-utf8.vtt | Bin 0 -> 3584 bytes webvtt_test.go | 5 +++++ 4 files changed, 10 insertions(+) create mode 100644 testdata/example-in-non-utf8.srt create mode 100644 testdata/example-in-non-utf8.vtt diff --git a/srt_test.go b/srt_test.go index 910e6d6..3dbf1eb 100644 --- a/srt_test.go +++ b/srt_test.go @@ -48,6 +48,11 @@ func TestSRTMissingSequence(t *testing.T) { assert.Equal(t, string(c), w.String()) } +func TestNonUTF8SRT(t *testing.T) { + _, err := astisub.OpenFile("./testdata/example-in-non-utf8.srt") + assert.Error(t, err) +} + func BenchmarkOpenSRT(b *testing.B) { f, _ := os.Open("./testdata/example-in.srt") defer f.Close() diff --git a/testdata/example-in-non-utf8.srt b/testdata/example-in-non-utf8.srt new file mode 100644 index 0000000000000000000000000000000000000000..41fea028837824531c7e57077e9bc2edf280926b GIT binary patch literal 734 zcmZ{iJ5R$v5QL}puW$!Q0pUlCp#ag*Km!t_OkyjJ$V*AAD1RRKb`OgPf=;&YZf<90 z=luDRDAFb4h9}{j=~}VQ8Mk~7ksQ3oPEN(CD%Fe?XVz*p*GMx>wa`Ro;E~b;-qhK1 zuP%?U2mA^=p)S^&JIT?qxuJzSi8XK}b!6a3pLwK5-Rp^Sksh?vj=Bmr72h2$W1fYt zT6f&AUh~{>KBD8F1We03T<|Wa`<+9;`{3pw(u*2&Hk@41|4?IB3p))?){LoVQ&VGV zmR^^tu^ErC<^QOq?j}`9-7`7Y=*(Qv@_u){5z1jVX!QSRlw_O1biAXCZVfM}dv(SN z?+cx1OU1}FZ9M_EJC51!`ri1=)=NBeT^iG2mq8)i=&?*Q)O%zQ{;e@wa}sUQv774B b-MhNEuoP<2-h!I5Df9R{oh3KD_rLH9l)7He literal 0 HcmV?d00001 diff --git a/testdata/example-in-non-utf8.vtt b/testdata/example-in-non-utf8.vtt new file mode 100644 index 0000000000000000000000000000000000000000..1f230b15b0b761f5c4a688b0fec0f14b34b43b26 GIT binary patch literal 3584 zcmeH}TW=Ck5Xa|vKgGT@(xh$5rHU9dnl$mjOJW-vAGvIi6j%a_h{msO{r%^#utgwE zd@>;?%bqi5=0Eox?tkCdrMyV^+sf&{CVg5rdqdp@aT0+_$V3@cr6A z=gEAEx7tnMr5MIlfu>JH}n0_PB@EWxa3|qML0dY&sNmqmHo=^ClGt^q~J<#AB-ju z4xYv~cZzb~w3?mUQNgw;v?^!u<`^w-lquz|>HH^F<{c!zftG+d_WKNPUD^Sw!szGzuKEy`gogzt@~|`h(P2qXz*Jm&Q*Mj$?K`T?vfrlKm6u=Q@Di~xbljX{z}2$iu+b`uj-&}dL8bcabHGG-CwMyMxho}HL^D2 z3H6#+%oJJ+xeV!+$D)T-4R#6gqDt-uC&m7PDvRvBz2@uIVD}cZIXvnf`8)S0C`LM| z?qZLO?0NC0tIT&a`BgrIy$N#ZP!utDy`ykWH*l3^^836 zShu0*>#cf>&^ZD{H52@t`c7wccJ8N;@;*>hg9d$Zc3Dux!kR~?5y I|9$@b0X^^;D*ylh literal 0 HcmV?d00001 diff --git a/webvtt_test.go b/webvtt_test.go index 9a271e3..f7a5ec2 100644 --- a/webvtt_test.go +++ b/webvtt_test.go @@ -49,6 +49,11 @@ func TestBroken1WebVTT(t *testing.T) { assert.Nil(t, err) } +func TestNonUTF8WebVTT(t *testing.T) { + _, err := astisub.OpenFile("./testdata/example-in-non-utf8.vtt") + assert.Error(t, err) +} + func TestWebVTTWithVoiceName(t *testing.T) { testData := `WEBVTT From fb672a9ea6a893a3b6b70ab7eaa6c689575ed740 Mon Sep 17 00:00:00 2001 From: Nick Brandt Date: Tue, 8 Oct 2024 09:45:48 -0400 Subject: [PATCH 8/8] Remove benchmarks --- srt_test.go | 9 --------- webvtt_test.go | 9 --------- 2 files changed, 18 deletions(-) diff --git a/srt_test.go b/srt_test.go index 3dbf1eb..611c3fc 100644 --- a/srt_test.go +++ b/srt_test.go @@ -3,7 +3,6 @@ package astisub_test import ( "bytes" "io/ioutil" - "os" "testing" "github.com/asticode/go-astisub" @@ -52,11 +51,3 @@ func TestNonUTF8SRT(t *testing.T) { _, err := astisub.OpenFile("./testdata/example-in-non-utf8.srt") assert.Error(t, err) } - -func BenchmarkOpenSRT(b *testing.B) { - f, _ := os.Open("./testdata/example-in.srt") - defer f.Close() - for i := 0; i < b.N; i++ { - astisub.ReadFromSRT(f) - } -} diff --git a/webvtt_test.go b/webvtt_test.go index f7a5ec2..8ab1fd9 100644 --- a/webvtt_test.go +++ b/webvtt_test.go @@ -3,7 +3,6 @@ package astisub_test import ( "bytes" "io/ioutil" - "os" "strings" "testing" "time" @@ -233,11 +232,3 @@ func TestWebVTTTags(t *testing.T) { Text with a <00:06:30.000>timestamp in the middle `, b.String()) } - -func BenchmarkOpenWebVTT(b *testing.B) { - f, _ := os.Open("./testdata/example-in.vtt") - defer f.Close() - for i := 0; i < b.N; i++ { - astisub.ReadFromWebVTT(f) - } -}