-
Notifications
You must be signed in to change notification settings - Fork 1
/
markParagraphs.pl
executable file
·78 lines (63 loc) · 2.16 KB
/
markParagraphs.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/bin/perl -w
use strict;
use FileHandle;
# Command-line handling.
#
# Usage: markParagraphs.pl [-d] SOURCEFILE OUTFILE
#   -d          optional; turns on the completion message printed at the end
#   SOURCEFILE  existing text file to read
#   OUTFILE     file to write the marked-up text to (overwritten if present)
my $DEBUG = 0;
# Check that an argument exists before comparing it, so running with no
# arguments reaches the usage message instead of emitting an
# "uninitialized value" warning under -w first.
if( @ARGV && $ARGV[0] eq '-d' ) {
    $DEBUG = 1;
    shift( @ARGV );
}
my $SOURCEFILE = $ARGV[0];
my $OUTFILE = $ARGV[1];
my $USAGE = "Usage: $0 [-d] compiled-book.txt output-file\n";
if( ! $SOURCEFILE ) {
    die "ERROR: Must supply the output file name from Scrivener.\n\n$USAGE";
}
# Fail early with a clear message; otherwise a missing output name only
# shows up later as warnings and a failed open of the output file.
if( ! defined $OUTFILE || $OUTFILE eq '' ) {
    die "ERROR: Must supply the name of the file to write.\n\n$USAGE";
}
if( ! -f $SOURCEFILE ) {
    die "ERROR: Unable to read file: $SOURCEFILE\n";
}
print 'Source: ' . $SOURCEFILE . "\n";
print 'Output: ' . $OUTFILE . "\n";
# An existing output file is not an error; it is simply replaced later.
if( -f $OUTFILE ) {
    print "WARNING: Overwriting existing temp file: $OUTFILE\n";
}
# Read the whole source file into $fulltext; the substitutions that follow
# need to see the document as a single string because they match across
# line boundaries.
my $fulltext = '';
# Three-argument open with a lexical handle: the filename is never parsed
# for mode characters, and the handle cannot clash with other barewords.
open( my $input_fh, '<', $SOURCEFILE )
    or die "Unable to read sourcefile: ${SOURCEFILE} ($!)\n";
{
    local $/;    # slurp mode: read everything in one go
    my $content = <$input_fh>;
    # Keep the empty-string default if the read yields nothing.
    $fulltext = $content if defined $content;
}
close( $input_fh )
    or die "Unable to close sourcefile: ${SOURCEFILE} ($!)\n";
# Paragraph marking.
#
# Every pass below treats a blank line ("\n\n") as the block separator and
# wraps qualifying blocks in <p>...</p>. Most passes are written as
#     while( s/.../.../g ) {}
# because each match consumes its trailing blank line, which is also the
# leading blank line of the next block; a single /g sweep therefore skips
# the immediately following block, and the loop repeats the sweep until no
# substitution happens.

# Add linefeeds to the end (Scrivener strips it) for paragraph identification purposes
$fulltext .= "\n\n\n";

# Prevent adding paragraphs to pre blocks: blank lines inside <pre>...</pre>
# are temporarily marked by addBreak() so the passes below do not see them.
# \b (rather than a literal space) lets this cover a bare <pre> as well as
# <pre ...attributes...>.
$fulltext =~ s|(<pre\b[^>]*>)(.*?)(</pre>)|$1 . addBreak( $2 ) . $3|egs;

# Add paragraphs to things inside an aside: text that directly follows an
# <aside data-type="..."> opening tag and starts with a word character.
while( $fulltext =~ s|\n\n(<aside data-type="\w+">\s*)([\w]+.*?)\s*\n\n|\n\n$1<p>$2</p>\n\n|gs ) {}

# Add paragraphs to things starting with an anchor (<a href="..." or
# <a data-type="..." whose attribute value starts with a word character).
while( $fulltext =~ s#\n\n(<a (href|data-type)="[\w]+.*?)\s*\n\n#\n\n<p>$1</p>\n\n#gs ) {}

# Fix anchors?
# F: <a href="#([\w\-\:\.]+)">([\w\s\-\:\.]+)</a>
# R: <a data-type="xref" href='#$1'>#$1</a>

# Add paragraphs to everything that looks like a paragraph: a block that
# starts with a word character or a period.
while( $fulltext =~ s|\n\n([\w\.]+.*?)\s*\n\n|\n\n<p>$1</p>\n\n|gs ) {}

# A wrapped block that ended with </aside> now reads "...</aside></p>";
# swap the two closing tags so the paragraph closes inside the aside,
# keeping any whitespace that preceded </aside>.
while( $fulltext =~ s|(\s*)</aside></p>|</p>$1</aside>|gs ) {}

# Add paragraphs to things that start with an internal anchor (href="#...");
# the earlier anchor pass does not match these because '#' is not a word
# character.
while( $fulltext =~ s|\n\n(<a href="#[\w]+.*?)\s*\n\n|\n\n<p>$1</p>\n\n|gs ) {}

# Remove the pre-comments: restore the blank lines inside <pre> blocks that
# were marked before the paragraph passes. The tag pattern must stay in
# step with the marking substitution above.
$fulltext =~ s|(<pre\b[^>]*>)(.*?)(</pre>)|$1 . removeBreak( $2 ) . $3|egs;
# Output the file
# Each step is checked: a failed open would otherwise die on the print with
# an unhelpful "undefined value as filehandle" error, and buffered write
# errors may only surface when the handle is closed.
my $OUTPUTFH = FileHandle->new( $OUTFILE, 'w' )
    or die "ERROR: Unable to open output file for writing: $OUTFILE ($!)\n";
print $OUTPUTFH $fulltext
    or die "ERROR: Unable to write to output file: $OUTFILE ($!)\n";
$OUTPUTFH->close()
    or die "ERROR: Unable to finish writing output file: $OUTFILE ($!)\n";

# Completion message is only shown when -d was given.
print "Finished parsing for paragraphs.\n" if $DEBUG;

exit 0;
# addBreak( $codeblock )
#
# Returns a copy of $codeblock in which no two newlines are adjacent: the
# marker "<!-- PRE -->" is inserted after every newline that is directly
# followed by another newline. This hides blank lines from substitutions
# that key on "\n\n".
#
# The lookahead matters: matching "\n\n" as a pair consumes both newlines,
# so in a run of three or more the leftover newline ends up next to the
# following one and a "\n\n" survives. Testing each newline individually
# handles runs of any length. For an isolated blank line the result is the
# same as pair matching ("\n<!-- PRE -->\n").
sub addBreak {
    my $codeblock = shift;
    $codeblock =~ s|\n(?=\n)|\n<!-- PRE -->|g;
    return $codeblock;
}

# removeBreak( $codeblock )
#
# Inverse of addBreak(): returns a copy of $codeblock with every
# "<!-- PRE -->" marker that sits between two newlines removed, restoring
# the original run of newlines. The trailing newline is only looked at, not
# consumed, so back-to-back markers are all removed in one sweep; text
# marked pairwise ("\n<!-- PRE -->\n" per blank line) is restored as well.
sub removeBreak {
    my $codeblock = shift;
    $codeblock =~ s|\n<!-- PRE -->(?=\n)|\n|g;
    return $codeblock;
}