Skip to content

Commit

Permalink
Converters to/from markdown (#31)
Browse files Browse the repository at this point in the history
* Use the AST to parse markdown documents and turn them into blocks so
they can potentially be rendered as notebooks.

* Add some test utilities to define common comparers
  • Loading branch information
jlewi authored Apr 8, 2024
1 parent 69bd015 commit b9194cd
Show file tree
Hide file tree
Showing 9 changed files with 346 additions and 4 deletions.
8 changes: 6 additions & 2 deletions app/go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,13 @@ go 1.22.1
replace github.com/jlewi/foyle/protos/go => ../protos/go

require (
github.com/Kunde21/markdownfmt/v3 v3.1.0
github.com/gin-contrib/cors v1.7.1
github.com/gin-gonic/gin v1.9.1
github.com/go-cmd/cmd v1.4.1
github.com/go-logr/logr v1.3.0
github.com/go-logr/zapr v1.3.0
github.com/google/go-cmp v0.6.0
github.com/grpc-ecosystem/grpc-gateway/v2 v2.19.1
github.com/jlewi/foyle/protos/go v0.0.0-00010101000000-000000000000
github.com/jlewi/hydros v0.0.6
Expand All @@ -17,6 +20,7 @@ require (
github.com/spf13/cobra v1.8.0
github.com/spf13/viper v1.18.2
github.com/timtadh/lexmachine v0.2.3
github.com/yuin/goldmark v1.4.13
go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.46.1
go.uber.org/zap v1.27.0
google.golang.org/grpc v1.62.1
Expand Down Expand Up @@ -55,7 +59,6 @@ require (
github.com/gabriel-vasile/mimetype v1.4.3 // indirect
github.com/ghodss/yaml v1.0.0 // indirect
github.com/gin-contrib/sse v0.1.0 // indirect
github.com/go-cmd/cmd v1.4.1 // indirect
github.com/go-errors/errors v1.0.1 // indirect
github.com/go-git/gcfg v1.5.0 // indirect
github.com/go-git/go-billy/v5 v5.4.1 // indirect
Expand All @@ -72,7 +75,6 @@ require (
github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
github.com/golang/protobuf v1.5.3 // indirect
github.com/google/gnostic v0.6.9 // indirect
github.com/google/go-cmp v0.6.0 // indirect
github.com/google/go-containerregistry v0.18.0 // indirect
github.com/google/gofuzz v1.2.0 // indirect
github.com/google/s2a-go v0.1.7 // indirect
Expand All @@ -93,6 +95,7 @@ require (
github.com/mailru/easyjson v0.7.7 // indirect
github.com/matryer/try v0.0.0-20161228173917-9ac251b645a2 // indirect
github.com/mattn/go-isatty v0.0.20 // indirect
github.com/mattn/go-runewidth v0.0.13 // indirect
github.com/mitchellh/go-homedir v1.1.0 // indirect
github.com/mitchellh/mapstructure v1.5.0 // indirect
github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
Expand All @@ -102,6 +105,7 @@ require (
github.com/opencontainers/image-spec v1.1.0-rc3 // indirect
github.com/pelletier/go-toml/v2 v2.2.0 // indirect
github.com/pjbgf/sha1cd v0.3.0 // indirect
github.com/rivo/uniseg v0.4.2 // indirect
github.com/sagikazarmark/locafero v0.4.0 // indirect
github.com/sagikazarmark/slog-shim v0.1.0 // indirect
github.com/sergi/go-diff v1.2.0 // indirect
Expand Down
8 changes: 8 additions & 0 deletions app/go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@ cloud.google.com/go/storage v1.36.0 h1:P0mOkAcaJxhCTvAkMhxMfrTKiNcub4YmmPBtlhAyT
cloud.google.com/go/storage v1.36.0/go.mod h1:M6M/3V/D3KpzMTJyPOR/HU6n2Si5QdaXYEsng2xgOs8=
github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
github.com/BurntSushi/toml v1.2.1/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ=
github.com/Kunde21/markdownfmt/v3 v3.1.0 h1:KiZu9LKs+wFFBQKhrZJrFZwtLnCCWJahL+S+E/3VnM0=
github.com/Kunde21/markdownfmt/v3 v3.1.0/go.mod h1:tPXN1RTyOzJwhfHoon9wUr4HGYmWgVxSQN6VBJDkrVc=
github.com/Microsoft/go-winio v0.5.2/go.mod h1:WpS1mjBmmwHBEWmogvA2mj8546UReBk4v8QkMxJ6pZY=
github.com/Microsoft/go-winio v0.6.1 h1:9/kr64B9VUZrLm5YYwbGtUJnMgqWVOdUAXu6Migciow=
github.com/Microsoft/go-winio v0.6.1/go.mod h1:LRdKpFKfdobln8UmuiYcKPot9D2v6svN5+sAH+4kjUM=
Expand Down Expand Up @@ -253,6 +255,8 @@ github.com/matryer/try v0.0.0-20161228173917-9ac251b645a2 h1:JAEbJn3j/FrhdWA9jW8
github.com/matryer/try v0.0.0-20161228173917-9ac251b645a2/go.mod h1:0KeJpeMD6o+O4hW7qJOT7vyQPKrWmj26uf5wMc/IiIs=
github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
github.com/mattn/go-runewidth v0.0.13 h1:lTGmDsbAYt5DmK6OnoV7EuIF1wEIFAcxld6ypU4OSgU=
github.com/mattn/go-runewidth v0.0.13/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y=
github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0=
github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY=
Expand Down Expand Up @@ -280,6 +284,9 @@ github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZN
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
github.com/rivo/uniseg v0.4.2 h1:YwD0ulJSJytLpiaWua0sBDusfsCZohxjxzVTYjwxfV8=
github.com/rivo/uniseg v0.4.2/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ=
github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8=
github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
Expand Down Expand Up @@ -350,6 +357,7 @@ github.com/xlab/treeprint v1.1.0 h1:G/1DjNkPpfZCFt9CSh6b5/nY4VimlbHF3Rh4obvtzDk=
github.com/xlab/treeprint v1.1.0/go.mod h1:gj5Gd3gPdKtR1ikdDK6fnFLdmIS0X30kTTuNd/WEJu0=
github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
github.com/yuin/goldmark v1.4.13 h1:fVcFKWvrslecOb/tg+Cc05dkeYx540o0FuFt3nUVDoE=
github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0=
go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo=
Expand Down
10 changes: 10 additions & 0 deletions app/pkg/docs/const.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
package docs

// Language identifiers used for the info string of fenced code blocks.
const (
	// BASHLANG is the language given to fenced code blocks that hold code.
	BASHLANG = "bash"
	// OUTPUTLANG is the language given to fenced code blocks that hold output.
	// In notebooks, outputs are nested inside the block that produced them, so
	// when a markdown document is converted back into blocks a dedicated
	// language lets us recognize output fences and re-attach them to their
	// block rather than treating them as code.
	OUTPUTLANG = "output"
)
147 changes: 147 additions & 0 deletions app/pkg/docs/converters.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
package docs

import (
"strings"

markdownfmt "github.com/Kunde21/markdownfmt/v3/markdown"
"github.com/jlewi/foyle/protos/go/foyle/v1alpha1"
"github.com/yuin/goldmark"
"github.com/yuin/goldmark/ast"
"github.com/yuin/goldmark/text"
)

// BlockToMarkdown converts a block to markdown.
//
// A CODE block is written as a fenced code block; any other kind is assumed
// to already be markdown and is written verbatim followed by a newline.
// Every output item of the block is then appended as its own fenced code
// block tagged with OUTPUTLANG so that it can be told apart from code when
// the markdown is parsed again.
//
// A nil block, or nil entries in its outputs, are tolerated and contribute
// nothing beyond what the empty value would.
func BlockToMarkdown(block *v1alpha1.Block) string {
	var sb strings.Builder

	switch block.GetKind() {
	case v1alpha1.BlockKind_CODE:
		// Code just gets written as a code block.
		// NOTE(review): the fence is always tagged BASHLANG; the block's own
		// Language field is not consulted here — confirm this is intended.
		sb.WriteString("```" + BASHLANG + "\n")
		sb.WriteString(block.GetContents())
		sb.WriteString("\n```\n")
	default:
		// Otherwise assume it's a markdown block.
		sb.WriteString(block.GetContents() + "\n")
	}

	// Handle the outputs. The getters are used throughout (rather than direct
	// field access) so a nil *BlockOutput in the slice does not panic.
	for _, output := range block.GetOutputs() {
		for _, oi := range output.GetItems() {
			sb.WriteString("```" + OUTPUTLANG + "\n")
			sb.WriteString(oi.GetTextData())
			sb.WriteString("\n```\n")
		}
	}

	return sb.String()
}

// MarkdownToBlocks converts a markdown string into a sequence of blocks.
// This function relies on the goldmark library to parse the markdown into an AST.
//
// Only the nodes directly beneath the document node are visited:
//   - A fenced code block becomes a CODE block whose Language is the language
//     named on the fence.
//   - A fenced code block whose language is OUTPUTLANG and whose preceding
//     block is a CODE block is attached to that block as an output instead of
//     becoming a block of its own.
//   - Any other node is rendered back to markdown (together with its
//     children) and becomes a MARKUP block. Consecutive MARKUP blocks are
//     merged into one.
//
// If rendering a node fails, the error is returned and the returned slice is nil.
func MarkdownToBlocks(mdText string) ([]*v1alpha1.Block, error) {
	gm := goldmark.New()
	source := []byte(mdText)
	reader := text.NewReader(source)
	root := gm.Parser().Parse(reader)

	renderer := markdownfmt.NewRenderer()

	blocks := make([]*v1alpha1.Block, 0, 20)

	err := ast.Walk(root, func(node ast.Node, entering bool) (ast.WalkStatus, error) {
		if !entering {
			// Do nothing on leaving the block; just continue the walk
			return ast.WalkContinue, nil
		}

		if node.Kind() == ast.KindDocument {
			// Ignore the document node
			return ast.WalkContinue, nil
		}

		if node.Kind() != ast.KindFencedCodeBlock {
			// Since we aren't in a code block render the node and its children to markdown
			// so we can add them as a block
			var sb strings.Builder
			if err := renderer.Render(&sb, source, node); err != nil {
				return ast.WalkStop, err
			}
			blocks = append(blocks, &v1alpha1.Block{
				Kind:     v1alpha1.BlockKind_MARKUP,
				Contents: sb.String(),
			})
			// Skip the children because we've already rendered the children to markdown so there's no need
			// to visit the children nodes
			return ast.WalkSkipChildren, nil
		}

		// Since we encountered a fenced code block we need to extract the code block
		fenced := node.(*ast.FencedCodeBlock)
		lang := string(fenced.Language(source))
		textData := getBlockText(fenced, source)

		lastBlock := len(blocks) - 1
		lastWasCode := lastBlock >= 0 && blocks[lastBlock].Kind == v1alpha1.BlockKind_CODE

		if lang == OUTPUTLANG && lastWasCode {
			// Since it's an output block and the last block was a code block we should append the output
			// to the last block. append allocates as needed, so a nil Outputs slice is fine.
			blocks[lastBlock].Outputs = append(blocks[lastBlock].Outputs, &v1alpha1.BlockOutput{
				Items: []*v1alpha1.BlockOutputItem{
					{
						TextData: textData,
					},
				},
			})
		} else {
			blocks = append(blocks, &v1alpha1.Block{
				Kind:     v1alpha1.BlockKind_CODE,
				Contents: textData,
				Language: lang,
			})
		}

		// We can skip walking the children of the code block since we've already ingested the code block
		return ast.WalkSkipChildren, nil
	})
	if err != nil {
		// Don't hand back a partially built result alongside an error.
		return nil, err
	}

	// The way we walk the AST above we potentially end up segmenting continuous markdown without code blocks
	// into more than one block. So we merge these blocks.
	final := make([]*v1alpha1.Block, 0, len(blocks))
	for _, block := range blocks {
		n := len(final)
		if n > 0 && block.Kind == v1alpha1.BlockKind_MARKUP && final[n-1].Kind == v1alpha1.BlockKind_MARKUP {
			final[n-1].Contents += block.Contents
			continue
		}
		final = append(final, block)
	}

	return final, nil
}

// getBlockText returns the text of a fenced code block, built by
// concatenating, in order, the bytes that each of the block's line segments
// selects from source.
func getBlockText(fenced *ast.FencedCodeBlock, source []byte) string {
	var out strings.Builder
	lines := fenced.Lines()
	for idx, count := 0, lines.Len(); idx < count; idx++ {
		// Keep the segment in a local so it is addressable when Value is called on it.
		segment := lines.At(idx)
		out.Write(segment.Value(source))
	}
	return out.String()
}
136 changes: 136 additions & 0 deletions app/pkg/docs/converters_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
package docs

import (
"os"
"path/filepath"
"testing"

"github.com/google/go-cmp/cmp"
"github.com/jlewi/foyle/app/pkg/testutil"
"github.com/jlewi/foyle/protos/go/foyle/v1alpha1"
)

// Test_BlockToMarkdown checks the markdown produced for a plain markup block
// and for a code block that carries a single text output.
func Test_BlockToMarkdown(t *testing.T) {
	type testCase struct {
		name     string
		block    *v1alpha1.Block
		expected string
	}

	// A markup block is emitted verbatim with a trailing newline.
	markupBlock := &v1alpha1.Block{
		Kind:     v1alpha1.BlockKind_MARKUP,
		Contents: "This is a test",
	}

	// A code block is fenced as bash and its output is fenced as output.
	codeBlock := &v1alpha1.Block{
		Kind:     v1alpha1.BlockKind_CODE,
		Contents: "echo \"something something\"",
		Outputs: []*v1alpha1.BlockOutput{
			{
				Items: []*v1alpha1.BlockOutputItem{
					{
						TextData: "something something",
					},
				},
			},
		},
	}

	cases := []testCase{
		{
			name:     "markup",
			block:    markupBlock,
			expected: "This is a test\n",
		},
		{
			name:     "code",
			block:    codeBlock,
			expected: "```bash\necho \"something something\"\n```\n```output\nsomething something\n```\n",
		},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			got := BlockToMarkdown(tc.block)
			diff := cmp.Diff(tc.expected, got)
			if diff != "" {
				t.Errorf("Unexpected diff:\n%s", diff)
			}
		})
	}
}

// Test_MarkdownToBlocks parses markdown files from the test_data directory
// (resolved against the current working directory) and compares the resulting
// blocks, one by one, with the expected blocks.
func Test_MarkdownToBlocks(t *testing.T) {
	type testCase struct {
		name     string
		inFile   string
		expected []*v1alpha1.Block
	}

	// Blocks expected from testdoc.md.
	simpleExpected := []*v1alpha1.Block{
		{
			Kind:     v1alpha1.BlockKind_MARKUP,
			Contents: "# Section 1\n\nThis is section 1",
		},
		{
			Kind:     v1alpha1.BlockKind_CODE,
			Language: "go",
			Contents: "package main\n\nfunc main() {\n...\n}\n",
		},
		{
			Kind:     v1alpha1.BlockKind_MARKUP,
			Contents: "\n\nBreaking text",
		},
		{
			Kind:     v1alpha1.BlockKind_CODE,
			Language: "bash",
			Contents: "echo \"Hello, World!\"\n",
			Outputs: []*v1alpha1.BlockOutput{
				{
					Items: []*v1alpha1.BlockOutputItem{
						{
							TextData: "hello, world!\n",
						},
					},
				},
			},
		},
		{
			Kind:     v1alpha1.BlockKind_MARKUP,
			Contents: "\n\n## Subsection",
		},
	}

	cases := []testCase{
		{
			name:     "simple",
			inFile:   "testdoc.md",
			expected: simpleExpected,
		},
	}

	cwd, err := os.Getwd()
	if err != nil {
		t.Fatalf("Failed to get working directory: %v", err)
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			path := filepath.Join(cwd, "test_data", tc.inFile)
			contents, err := os.ReadFile(path)
			if err != nil {
				t.Fatalf("Failed to read raw file: %v", err)
			}

			got, err := MarkdownToBlocks(string(contents))
			if err != nil {
				t.Fatalf("MarkdownToBlocks(%v) returned error %v", tc.inFile, err)
			}

			// A length mismatch is reported but does not stop the test, so the
			// blocks that do line up are still compared below.
			if len(got) != len(tc.expected) {
				t.Errorf("Expected %v blocks got %v", len(tc.expected), len(got))
			}

			// Compare only the positions present in both slices.
			for i := 0; i < len(tc.expected) && i < len(got); i++ {
				diff := cmp.Diff(tc.expected[i], got[i], testutil.BlockComparer)
				if diff != "" {
					t.Errorf("Unexpected diff block %d:\n%s", i, diff)
				}
			}
		})
	}
}
2 changes: 2 additions & 0 deletions app/pkg/docs/docs.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
// Package docs contains routines for working with documents.
package docs
Loading

0 comments on commit b9194cd

Please sign in to comment.