-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Use the AST to parse markdown documents and turn them into blocks so they can potentially be rendered as notebooks. * Add some test utilities to define common comparers
- Loading branch information
Showing
9 changed files
with
346 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
package docs | ||
|
||
const ( | ||
// BASHLANG is the language tag used for the fenced code blocks that hold a block's code. | ||
BASHLANG = "bash" | ||
// OUTPUTLANG is the language to give to output code blocks. | ||
// We want to be able to distinguish output from code blocks because output blocks are nested inside blocks | ||
// in notebooks. Therefore, if we want to convert a markdown document back into a document with blocks, | ||
// having a unique language for output blocks helps us identify them and properly re-encode them. | ||
OUTPUTLANG = "output" | ||
) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,147 @@ | ||
package docs | ||
|
||
import ( | ||
"strings" | ||
|
||
markdownfmt "github.com/Kunde21/markdownfmt/v3/markdown" | ||
"github.com/jlewi/foyle/protos/go/foyle/v1alpha1" | ||
"github.com/yuin/goldmark" | ||
"github.com/yuin/goldmark/ast" | ||
"github.com/yuin/goldmark/text" | ||
) | ||
|
||
// BlockToMarkdown converts a block to markdown | ||
func BlockToMarkdown(block *v1alpha1.Block) string { | ||
sb := strings.Builder{} | ||
|
||
switch block.GetKind() { | ||
case v1alpha1.BlockKind_CODE: | ||
// Code just gets written as a code block | ||
sb.WriteString("```" + BASHLANG + "\n") | ||
sb.WriteString(block.GetContents()) | ||
sb.WriteString("\n```\n") | ||
default: | ||
// Otherwise assume its a markdown block | ||
sb.WriteString(block.GetContents() + "\n") | ||
} | ||
|
||
// Handle the outputs | ||
for _, output := range block.GetOutputs() { | ||
for _, oi := range output.Items { | ||
sb.WriteString("```" + OUTPUTLANG + "\n") | ||
sb.WriteString(oi.GetTextData()) | ||
sb.WriteString("\n```\n") | ||
} | ||
} | ||
|
||
return sb.String() | ||
} | ||
|
||
// MarkdownToBlocks converts a markdown string into a sequence of blocks. | ||
// This function relies on the goldmark library to parse the markdown into an AST. | ||
func MarkdownToBlocks(mdText string) ([]*v1alpha1.Block, error) { | ||
gm := goldmark.New() | ||
source := []byte(mdText) | ||
reader := text.NewReader(source) | ||
root := gm.Parser().Parse(reader) | ||
|
||
renderer := markdownfmt.NewRenderer() | ||
|
||
blocks := make([]*v1alpha1.Block, 0, 20) | ||
|
||
err := ast.Walk(root, func(node ast.Node, entering bool) (ast.WalkStatus, error) { | ||
if !entering { | ||
// Do nothing on leaving the block; just continue the walk | ||
return ast.WalkContinue, nil | ||
} | ||
|
||
if node.Kind() == ast.KindDocument { | ||
// Ignore the document node | ||
return ast.WalkContinue, nil | ||
} | ||
|
||
if node.Kind() != ast.KindFencedCodeBlock { | ||
// Since we aren't in a code block render the node and its children to markdown | ||
// so we can add them as a block | ||
var sb strings.Builder | ||
if err := renderer.Render(&sb, source, node); err != nil { | ||
return ast.WalkStop, err | ||
} | ||
newBlock := &v1alpha1.Block{ | ||
Kind: v1alpha1.BlockKind_MARKUP, | ||
Contents: sb.String(), | ||
} | ||
blocks = append(blocks, newBlock) | ||
// Skip the children because we've already rendered the children to markdown so there's no need | ||
// to visit the children nodes | ||
return ast.WalkSkipChildren, nil | ||
|
||
} | ||
|
||
// Since we encountered a fenced code block we need to extract the code block | ||
fenced := node.(*ast.FencedCodeBlock) | ||
lang := string(fenced.Language(source)) | ||
textData := getBlockText(fenced, source) | ||
|
||
lastBlock := len(blocks) - 1 | ||
lastWasCode := false | ||
if lastBlock >= 0 && blocks[lastBlock].Kind == v1alpha1.BlockKind_CODE { | ||
lastWasCode = true | ||
} | ||
|
||
if lang == OUTPUTLANG && lastWasCode { | ||
// Since its an output block and the last block was a code block we should append the output to the last block | ||
if blocks[lastBlock].Outputs == nil { | ||
blocks[lastBlock].Outputs = make([]*v1alpha1.BlockOutput, 0, 1) | ||
} | ||
blocks[lastBlock].Outputs = append(blocks[lastBlock].Outputs, &v1alpha1.BlockOutput{ | ||
Items: []*v1alpha1.BlockOutputItem{ | ||
{ | ||
TextData: textData, | ||
}, | ||
}, | ||
}) | ||
} else { | ||
block := &v1alpha1.Block{ | ||
Kind: v1alpha1.BlockKind_CODE, | ||
Contents: textData, | ||
Language: lang, | ||
} | ||
blocks = append(blocks, block) | ||
} | ||
|
||
// We can skip walking the children of the code block since we've already ingested the code block | ||
return ast.WalkSkipChildren, nil | ||
}) | ||
|
||
// The way we walk the AST above we potentially end up segmenting continuous markdown without code blocks | ||
// into more than one block. So we merge these blocks. | ||
final := make([]*v1alpha1.Block, 0, len(blocks)) | ||
i := 0 | ||
for _, block := range blocks { | ||
lastBlock := i - 1 | ||
addToLastBlock := false | ||
if lastBlock >= 0 && block.Kind == v1alpha1.BlockKind_MARKUP && final[lastBlock].Kind == v1alpha1.BlockKind_MARKUP { | ||
addToLastBlock = true | ||
} | ||
|
||
if addToLastBlock { | ||
final[lastBlock].Contents += block.Contents | ||
} else { | ||
final = append(final, block) | ||
i++ | ||
} | ||
} | ||
|
||
return final, err | ||
} | ||
|
||
func getBlockText(fenced *ast.FencedCodeBlock, source []byte) string { | ||
var sb strings.Builder | ||
for i := 0; i < fenced.Lines().Len(); i++ { | ||
// Get the i'th line | ||
line := fenced.Lines().At(i) | ||
sb.WriteString(string(line.Value(source))) | ||
} | ||
return sb.String() | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,136 @@ | ||
package docs | ||
|
||
import ( | ||
"os" | ||
"path/filepath" | ||
"testing" | ||
|
||
"github.com/google/go-cmp/cmp" | ||
"github.com/jlewi/foyle/app/pkg/testutil" | ||
"github.com/jlewi/foyle/protos/go/foyle/v1alpha1" | ||
) | ||
|
||
func Test_BlockToMarkdown(t *testing.T) { | ||
type testCase struct { | ||
name string | ||
block *v1alpha1.Block | ||
expected string | ||
} | ||
|
||
testCases := []testCase{ | ||
{ | ||
name: "markup", | ||
block: &v1alpha1.Block{ | ||
Kind: v1alpha1.BlockKind_MARKUP, | ||
Contents: "This is a test", | ||
}, | ||
expected: "This is a test\n", | ||
}, | ||
{ | ||
name: "code", | ||
block: &v1alpha1.Block{ | ||
Kind: v1alpha1.BlockKind_CODE, | ||
Contents: "echo \"something something\"", | ||
Outputs: []*v1alpha1.BlockOutput{ | ||
{ | ||
Items: []*v1alpha1.BlockOutputItem{ | ||
{ | ||
TextData: "something something", | ||
}, | ||
}, | ||
}, | ||
}, | ||
}, | ||
expected: "```bash\necho \"something something\"\n```\n```output\nsomething something\n```\n", | ||
}, | ||
} | ||
for _, c := range testCases { | ||
t.Run(c.name, func(t *testing.T) { | ||
actual := BlockToMarkdown(c.block) | ||
if d := cmp.Diff(c.expected, actual); d != "" { | ||
t.Errorf("Unexpected diff:\n%s", d) | ||
} | ||
}) | ||
} | ||
} | ||
|
||
func Test_MarkdownToBlocks(t *testing.T) { | ||
type testCase struct { | ||
name string | ||
inFile string | ||
expected []*v1alpha1.Block | ||
} | ||
|
||
cases := []testCase{ | ||
{ | ||
name: "simple", | ||
inFile: "testdoc.md", | ||
expected: []*v1alpha1.Block{ | ||
{ | ||
Kind: v1alpha1.BlockKind_MARKUP, | ||
Contents: "# Section 1\n\nThis is section 1", | ||
}, | ||
{ | ||
Kind: v1alpha1.BlockKind_CODE, | ||
Language: "go", | ||
Contents: "package main\n\nfunc main() {\n...\n}\n", | ||
}, | ||
{ | ||
Kind: v1alpha1.BlockKind_MARKUP, | ||
Contents: "\n\nBreaking text", | ||
}, | ||
{ | ||
Kind: v1alpha1.BlockKind_CODE, | ||
Language: "bash", | ||
Contents: "echo \"Hello, World!\"\n", | ||
Outputs: []*v1alpha1.BlockOutput{ | ||
{ | ||
Items: []*v1alpha1.BlockOutputItem{ | ||
{ | ||
TextData: "hello, world!\n", | ||
}}, | ||
}, | ||
}, | ||
}, | ||
{ | ||
Kind: v1alpha1.BlockKind_MARKUP, | ||
Contents: "\n\n## Subsection", | ||
}, | ||
}, | ||
}, | ||
} | ||
|
||
cwd, err := os.Getwd() | ||
if err != nil { | ||
t.Fatalf("Failed to get working directory: %v", err) | ||
} | ||
|
||
for _, c := range cases { | ||
t.Run(c.name, func(t *testing.T) { | ||
fPath := filepath.Join(cwd, "test_data", c.inFile) | ||
raw, err := os.ReadFile(fPath) | ||
if err != nil { | ||
t.Fatalf("Failed to read raw file: %v", err) | ||
} | ||
actual, err := MarkdownToBlocks(string(raw)) | ||
if err != nil { | ||
t.Fatalf("MarkdownToBlocks(%v) returned error %v", c.inFile, err) | ||
} | ||
if len(actual) != len(c.expected) { | ||
t.Errorf("Expected %v blocks got %v", len(c.expected), len(actual)) | ||
} | ||
|
||
for i, eBlock := range c.expected { | ||
if i >= len(actual) { | ||
break | ||
} | ||
|
||
aBlock := actual[i] | ||
|
||
if d := cmp.Diff(eBlock, aBlock, testutil.BlockComparer); d != "" { | ||
t.Errorf("Unexpected diff block %d:\n%s", i, d) | ||
} | ||
} | ||
}) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
// Package docs contains routines for working with documents. | ||
package docs |
Oops, something went wrong.