Converters to/from markdown (#31)

* Use the AST to parse markdown documents and turn them into blocks so they can potentially be rendered as notebooks. * Add some test utilities to define common comparers
jlewi · Apr 8, 2024 · b9194cd · b9194cd
1 parent 69bd015
commit b9194cd
Show file tree

Hide file tree

Showing 9 changed files with 346 additions and 4 deletions.
diff --git a/app/go.mod b/app/go.mod
@@ -5,10 +5,13 @@ go 1.22.1
 replace github.com/jlewi/foyle/protos/go => ../protos/go
 
 require (
+	github.com/Kunde21/markdownfmt/v3 v3.1.0
 	github.com/gin-contrib/cors v1.7.1
 	github.com/gin-gonic/gin v1.9.1
+	github.com/go-cmd/cmd v1.4.1
 	github.com/go-logr/logr v1.3.0
 	github.com/go-logr/zapr v1.3.0
+	github.com/google/go-cmp v0.6.0
 	github.com/grpc-ecosystem/grpc-gateway/v2 v2.19.1
 	github.com/jlewi/foyle/protos/go v0.0.0-00010101000000-000000000000
 	github.com/jlewi/hydros v0.0.6
@@ -17,6 +20,7 @@ require (
 	github.com/spf13/cobra v1.8.0
 	github.com/spf13/viper v1.18.2
 	github.com/timtadh/lexmachine v0.2.3
+	github.com/yuin/goldmark v1.4.13
 	go.opentelemetry.io/contrib/instrumentation/google.golang.org/grpc/otelgrpc v0.46.1
 	go.uber.org/zap v1.27.0
 	google.golang.org/grpc v1.62.1
@@ -55,7 +59,6 @@ require (
 	github.com/gabriel-vasile/mimetype v1.4.3 // indirect
 	github.com/ghodss/yaml v1.0.0 // indirect
 	github.com/gin-contrib/sse v0.1.0 // indirect
-	github.com/go-cmd/cmd v1.4.1 // indirect
 	github.com/go-errors/errors v1.0.1 // indirect
 	github.com/go-git/gcfg v1.5.0 // indirect
 	github.com/go-git/go-billy/v5 v5.4.1 // indirect
@@ -72,7 +75,6 @@ require (
 	github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
 	github.com/golang/protobuf v1.5.3 // indirect
 	github.com/google/gnostic v0.6.9 // indirect
-	github.com/google/go-cmp v0.6.0 // indirect
 	github.com/google/go-containerregistry v0.18.0 // indirect
 	github.com/google/gofuzz v1.2.0 // indirect
 	github.com/google/s2a-go v0.1.7 // indirect
@@ -93,6 +95,7 @@ require (
 	github.com/mailru/easyjson v0.7.7 // indirect
 	github.com/matryer/try v0.0.0-20161228173917-9ac251b645a2 // indirect
 	github.com/mattn/go-isatty v0.0.20 // indirect
+	github.com/mattn/go-runewidth v0.0.13 // indirect
 	github.com/mitchellh/go-homedir v1.1.0 // indirect
 	github.com/mitchellh/mapstructure v1.5.0 // indirect
 	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
@@ -102,6 +105,7 @@ require (
 	github.com/opencontainers/image-spec v1.1.0-rc3 // indirect
 	github.com/pelletier/go-toml/v2 v2.2.0 // indirect
 	github.com/pjbgf/sha1cd v0.3.0 // indirect
+	github.com/rivo/uniseg v0.4.2 // indirect
 	github.com/sagikazarmark/locafero v0.4.0 // indirect
 	github.com/sagikazarmark/slog-shim v0.1.0 // indirect
 	github.com/sergi/go-diff v1.2.0 // indirect

diff --git a/app/go.sum b/app/go.sum
@@ -20,6 +20,8 @@ cloud.google.com/go/storage v1.36.0 h1:P0mOkAcaJxhCTvAkMhxMfrTKiNcub4YmmPBtlhAyT
 cloud.google.com/go/storage v1.36.0/go.mod h1:M6M/3V/D3KpzMTJyPOR/HU6n2Si5QdaXYEsng2xgOs8=
 github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU=
 github.com/BurntSushi/toml v1.2.1/go.mod h1:CxXYINrC8qIiEnFrOxCa7Jy5BFHlXnUU2pbicEuybxQ=
+github.com/Kunde21/markdownfmt/v3 v3.1.0 h1:KiZu9LKs+wFFBQKhrZJrFZwtLnCCWJahL+S+E/3VnM0=
+github.com/Kunde21/markdownfmt/v3 v3.1.0/go.mod h1:tPXN1RTyOzJwhfHoon9wUr4HGYmWgVxSQN6VBJDkrVc=
 github.com/Microsoft/go-winio v0.5.2/go.mod h1:WpS1mjBmmwHBEWmogvA2mj8546UReBk4v8QkMxJ6pZY=
 github.com/Microsoft/go-winio v0.6.1 h1:9/kr64B9VUZrLm5YYwbGtUJnMgqWVOdUAXu6Migciow=
 github.com/Microsoft/go-winio v0.6.1/go.mod h1:LRdKpFKfdobln8UmuiYcKPot9D2v6svN5+sAH+4kjUM=
@@ -253,6 +255,8 @@ github.com/matryer/try v0.0.0-20161228173917-9ac251b645a2 h1:JAEbJn3j/FrhdWA9jW8
 github.com/matryer/try v0.0.0-20161228173917-9ac251b645a2/go.mod h1:0KeJpeMD6o+O4hW7qJOT7vyQPKrWmj26uf5wMc/IiIs=
 github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY=
 github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y=
+github.com/mattn/go-runewidth v0.0.13 h1:lTGmDsbAYt5DmK6OnoV7EuIF1wEIFAcxld6ypU4OSgU=
+github.com/mattn/go-runewidth v0.0.13/go.mod h1:Jdepj2loyihRzMpdS35Xk/zdY8IAYHsh153qUoGf23w=
 github.com/mitchellh/go-homedir v1.1.0 h1:lukF9ziXFxDFPkA1vsr5zpc1XuPDn/wFntq5mG+4E0Y=
 github.com/mitchellh/go-homedir v1.1.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0=
 github.com/mitchellh/mapstructure v1.5.0 h1:jeMsZIYE/09sWLaz43PL7Gy6RuMjD2eJVyuac5Z2hdY=
@@ -280,6 +284,9 @@ github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZN
 github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U=
 github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
 github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA=
+github.com/rivo/uniseg v0.2.0/go.mod h1:J6wj4VEh+S6ZtnVlnTBMWIodfgj8LQOQFoIToxlJtxc=
+github.com/rivo/uniseg v0.4.2 h1:YwD0ulJSJytLpiaWua0sBDusfsCZohxjxzVTYjwxfV8=
+github.com/rivo/uniseg v0.4.2/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88=
 github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ=
 github.com/rogpeppe/go-internal v1.9.0 h1:73kH8U+JUqXU8lRuOHeVHaa/SZPifC7BkcraZVejAe8=
 github.com/rogpeppe/go-internal v1.9.0/go.mod h1:WtVeX8xhTBvf0smdhujwtBcq4Qrzq/fJaraNFVN+nFs=
@@ -350,6 +357,7 @@ github.com/xlab/treeprint v1.1.0 h1:G/1DjNkPpfZCFt9CSh6b5/nY4VimlbHF3Rh4obvtzDk=
 github.com/xlab/treeprint v1.1.0/go.mod h1:gj5Gd3gPdKtR1ikdDK6fnFLdmIS0X30kTTuNd/WEJu0=
 github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
 github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74=
+github.com/yuin/goldmark v1.4.13 h1:fVcFKWvrslecOb/tg+Cc05dkeYx540o0FuFt3nUVDoE=
 github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY=
 go.opencensus.io v0.24.0 h1:y73uSU6J157QMP2kn2r30vwW1A2W2WFwSCGnAVxeaD0=
 go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo=

diff --git a/app/pkg/docs/const.go b/app/pkg/docs/const.go
@@ -0,0 +1,10 @@
+package docs
+
+const (
+	// BASHLANG is the fence language tag used for bash code blocks.
+	BASHLANG = "bash"
+	// OUTPUTLANG is the language to give to output code blocks.
+	// We want to potentially distinguish output from code blocks because output blocks are nested inside blocks
+	// in notebooks. Therefore, if we want to be able to convert a markdown document into a document with blocks,
+	// having a unique language for output blocks helps us identify them and properly re-encode them.
+	OUTPUTLANG = "output"
+)
diff --git a/app/pkg/docs/converters.go b/app/pkg/docs/converters.go
@@ -0,0 +1,147 @@
+package docs
+
+import (
+	"strings"
+
+	markdownfmt "github.com/Kunde21/markdownfmt/v3/markdown"
+	"github.com/jlewi/foyle/protos/go/foyle/v1alpha1"
+	"github.com/yuin/goldmark"
+	"github.com/yuin/goldmark/ast"
+	"github.com/yuin/goldmark/text"
+)
+
+// BlockToMarkdown converts a block to markdown
+func BlockToMarkdown(block *v1alpha1.Block) string {
+	sb := strings.Builder{}
+
+	switch block.GetKind() {
+	case v1alpha1.BlockKind_CODE:
+		// Code just gets written as a code block
+		sb.WriteString("```" + BASHLANG + "\n")
+		sb.WriteString(block.GetContents())
+		sb.WriteString("\n```\n")
+	default:
+		// Otherwise assume its a markdown block
+		sb.WriteString(block.GetContents() + "\n")
+	}
+
+	// Handle the outputs
+	for _, output := range block.GetOutputs() {
+		for _, oi := range output.Items {
+			sb.WriteString("```" + OUTPUTLANG + "\n")
+			sb.WriteString(oi.GetTextData())
+			sb.WriteString("\n```\n")
+		}
+	}
+
+	return sb.String()
+}
+
+// MarkdownToBlocks converts a markdown string into a sequence of blocks.
+// This function relies on the goldmark library to parse the markdown into an AST.
+//
+// The AST is walked from the document root. Each fenced code block that the walk visits becomes a CODE
+// block carrying the fence's language, except that a fence tagged OUTPUTLANG which directly follows
+// a CODE block is attached to that block as an output instead. Every other node is rendered back
+// to markdown together with its children and emitted as a MARKUP block; its children are not
+// visited separately. Consecutive MARKUP blocks are merged into one.
+//
+// If walking or rendering fails the error is returned and the block slice is nil.
+func MarkdownToBlocks(mdText string) ([]*v1alpha1.Block, error) {
+	gm := goldmark.New()
+	source := []byte(mdText)
+	reader := text.NewReader(source)
+	root := gm.Parser().Parse(reader)
+
+	renderer := markdownfmt.NewRenderer()
+
+	blocks := make([]*v1alpha1.Block, 0, 20)
+
+	err := ast.Walk(root, func(node ast.Node, entering bool) (ast.WalkStatus, error) {
+		if !entering {
+			// Do nothing on leaving the block; just continue the walk
+			return ast.WalkContinue, nil
+		}
+
+		if node.Kind() == ast.KindDocument {
+			// Ignore the document node
+			return ast.WalkContinue, nil
+		}
+
+		if node.Kind() != ast.KindFencedCodeBlock {
+			// Since we aren't in a code block render the node and its children to markdown
+			// so we can add them as a block
+			var sb strings.Builder
+			if err := renderer.Render(&sb, source, node); err != nil {
+				return ast.WalkStop, err
+			}
+			blocks = append(blocks, &v1alpha1.Block{
+				Kind:     v1alpha1.BlockKind_MARKUP,
+				Contents: sb.String(),
+			})
+			// Skip the children because we've already rendered the children to markdown so there's no need
+			// to visit the children nodes
+			return ast.WalkSkipChildren, nil
+		}
+
+		// Since we encountered a fenced code block we need to extract the code block
+		fenced := node.(*ast.FencedCodeBlock)
+		lang := string(fenced.Language(source))
+		textData := getBlockText(fenced, source)
+
+		lastBlock := len(blocks) - 1
+		lastWasCode := lastBlock >= 0 && blocks[lastBlock].Kind == v1alpha1.BlockKind_CODE
+
+		if lang == OUTPUTLANG && lastWasCode {
+			// Since it's an output block and the last block was a code block we should append the output to
+			// the last block. append handles a nil Outputs slice so no explicit allocation is needed.
+			blocks[lastBlock].Outputs = append(blocks[lastBlock].Outputs, &v1alpha1.BlockOutput{
+				Items: []*v1alpha1.BlockOutputItem{
+					{
+						TextData: textData,
+					},
+				},
+			})
+		} else {
+			blocks = append(blocks, &v1alpha1.Block{
+				Kind:     v1alpha1.BlockKind_CODE,
+				Contents: textData,
+				Language: lang,
+			})
+		}
+
+		// We can skip walking the children of the code block since we've already ingested the code block
+		return ast.WalkSkipChildren, nil
+	})
+	if err != nil {
+		// Don't hand back a partially converted document alongside an error.
+		return nil, err
+	}
+
+	// The way we walk the AST above we potentially end up segmenting continuous markdown without code blocks
+	// into more than one block. So we merge these blocks.
+	final := make([]*v1alpha1.Block, 0, len(blocks))
+	for _, block := range blocks {
+		n := len(final)
+		if n > 0 && block.Kind == v1alpha1.BlockKind_MARKUP && final[n-1].Kind == v1alpha1.BlockKind_MARKUP {
+			final[n-1].Contents += block.Contents
+			continue
+		}
+		final = append(final, block)
+	}
+
+	return final, nil
+}
+
+// getBlockText returns the text of a fenced code block by concatenating the source bytes of each
+// of the block's lines in order.
+func getBlockText(fenced *ast.FencedCodeBlock, source []byte) string {
+	var out strings.Builder
+	segments := fenced.Lines()
+	for idx := 0; idx < segments.Len(); idx++ {
+		// Each segment addresses one line of the block within source.
+		segment := segments.At(idx)
+		out.Write(segment.Value(source))
+	}
+	return out.String()
+}
diff --git a/app/pkg/docs/converters_test.go b/app/pkg/docs/converters_test.go
@@ -0,0 +1,136 @@
+package docs
+
+import (
+	"os"
+	"path/filepath"
+	"testing"
+
+	"github.com/google/go-cmp/cmp"
+	"github.com/jlewi/foyle/app/pkg/testutil"
+	"github.com/jlewi/foyle/protos/go/foyle/v1alpha1"
+)
+
+func Test_BlockToMarkdown(t *testing.T) {
+	type testCase struct {
+		name     string
+		block    *v1alpha1.Block
+		expected string
+	}
+
+	testCases := []testCase{
+		{
+			name: "markup",
+			block: &v1alpha1.Block{
+				Kind:     v1alpha1.BlockKind_MARKUP,
+				Contents: "This is a test",
+			},
+			expected: "This is a test\n",
+		},
+		{
+			name: "code",
+			block: &v1alpha1.Block{
+				Kind:     v1alpha1.BlockKind_CODE,
+				Contents: "echo \"something something\"",
+				Outputs: []*v1alpha1.BlockOutput{
+					{
+						Items: []*v1alpha1.BlockOutputItem{
+							{
+								TextData: "something something",
+							},
+						},
+					},
+				},
+			},
+			expected: "```bash\necho \"something something\"\n```\n```output\nsomething something\n```\n",
+		},
+	}
+	for _, c := range testCases {
+		t.Run(c.name, func(t *testing.T) {
+			actual := BlockToMarkdown(c.block)
+			if d := cmp.Diff(c.expected, actual); d != "" {
+				t.Errorf("Unexpected diff:\n%s", d)
+			}
+		})
+	}
+}
+
+func Test_MarkdownToBlocks(t *testing.T) {
+	type testCase struct {
+		name     string
+		inFile   string
+		expected []*v1alpha1.Block
+	}
+
+	cases := []testCase{
+		{
+			name:   "simple",
+			inFile: "testdoc.md",
+			expected: []*v1alpha1.Block{
+				{
+					Kind:     v1alpha1.BlockKind_MARKUP,
+					Contents: "# Section 1\n\nThis is section 1",
+				},
+				{
+					Kind:     v1alpha1.BlockKind_CODE,
+					Language: "go",
+					Contents: "package main\n\nfunc main() {\n...\n}\n",
+				},
+				{
+					Kind:     v1alpha1.BlockKind_MARKUP,
+					Contents: "\n\nBreaking text",
+				},
+				{
+					Kind:     v1alpha1.BlockKind_CODE,
+					Language: "bash",
+					Contents: "echo \"Hello, World!\"\n",
+					Outputs: []*v1alpha1.BlockOutput{
+						{
+							Items: []*v1alpha1.BlockOutputItem{
+								{
+									TextData: "hello, world!\n",
+								}},
+						},
+					},
+				},
+				{
+					Kind:     v1alpha1.BlockKind_MARKUP,
+					Contents: "\n\n## Subsection",
+				},
+			},
+		},
+	}
+
+	cwd, err := os.Getwd()
+	if err != nil {
+		t.Fatalf("Failed to get working directory: %v", err)
+	}
+
+	for _, c := range cases {
+		t.Run(c.name, func(t *testing.T) {
+			fPath := filepath.Join(cwd, "test_data", c.inFile)
+			raw, err := os.ReadFile(fPath)
+			if err != nil {
+				t.Fatalf("Failed to read raw file: %v", err)
+			}
+			actual, err := MarkdownToBlocks(string(raw))
+			if err != nil {
+				t.Fatalf("MarkdownToBlocks(%v) returned error %v", c.inFile, err)
+			}
+			if len(actual) != len(c.expected) {
+				t.Errorf("Expected %v blocks got %v", len(c.expected), len(actual))
+			}
+
+			for i, eBlock := range c.expected {
+				if i >= len(actual) {
+					break
+				}
+
+				aBlock := actual[i]
+
+				if d := cmp.Diff(eBlock, aBlock, testutil.BlockComparer); d != "" {
+					t.Errorf("Unexpected diff block %d:\n%s", i, d)
+				}
+			}
+		})
+	}
+}
diff --git a/app/pkg/docs/docs.go b/app/pkg/docs/docs.go
@@ -0,0 +1,2 @@
+// Package docs contains routines for working with documents.
+package docs
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		// Package docs contains routines for working with documents.
		package docs