inline.go 20 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781
  1. package md
  2. import (
  3. "regexp"
  4. "strings"
  5. "unicode"
  6. "unicode/utf8"
  7. )
  8. // InlineOp represents an inline operation.
  9. type InlineOp struct {
  10. Type InlineOpType
  11. // OpText, OpCodeSpan, OpRawHTML, OpAutolink: Text content
  12. // OpLinkStart, OpLinkEnd, OpImage: title text
  13. Text string
  14. // OpLinkStart, OpLinkEnd, OpImage, OpAutolink
  15. Dest string
  16. // ForOpImage
  17. Alt string
  18. }
  19. // InlineOpType enumerates possible types of an InlineOp.
  20. type InlineOpType uint
  21. const (
  22. // Text elements. Embedded newlines in OpText are turned into OpNewLine, but
  23. // OpRawHTML can contain embedded newlines. OpCodeSpan never contains
  24. // embedded newlines.
  25. OpText InlineOpType = iota
  26. OpCodeSpan
  27. OpRawHTML
  28. OpNewLine
  29. // Inline markup elements.
  30. OpEmphasisStart
  31. OpEmphasisEnd
  32. OpStrongEmphasisStart
  33. OpStrongEmphasisEnd
  34. OpLinkStart
  35. OpLinkEnd
  36. OpImage
  37. OpAutolink
  38. OpHardLineBreak
  39. )
  40. // String returns the text content of the InlineOp
  41. func (op InlineOp) String() string {
  42. switch op.Type {
  43. case OpText, OpCodeSpan, OpRawHTML, OpAutolink:
  44. return op.Text
  45. case OpNewLine:
  46. return "\n"
  47. case OpImage:
  48. return op.Alt
  49. }
  50. return ""
  51. }
  52. func renderInline(text string) []InlineOp {
  53. p := inlineParser{text, 0, makeDelimStack(), buffer{}}
  54. p.render()
  55. return p.buf.ops()
  56. }
  57. type inlineParser struct {
  58. text string
  59. pos int
  60. delims delimStack
  61. buf buffer
  62. }
  63. const (
  64. scheme = `[a-zA-Z][a-zA-Z0-9+.-]{1,31}`
  65. emailLocalPuncts = ".!#$%&'*+/=?^_`{|}~-"
  66. )
  67. var (
  68. // https://spec.commonmark.org/0.30/#uri-autolink
  69. uriAutolinkRegexp = regexp.MustCompile(
  70. `^<` + scheme + `:[^\x00-\x19 <>]*` + `>`)
  71. // https://spec.commonmark.org/0.30/#email-autolink
  72. emailAutolinkRegexp = regexp.MustCompile(
  73. `^<[a-zA-Z0-9` + emailLocalPuncts + `]+@[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?(?:\.[a-zA-Z0-9](?:[a-zA-Z0-9-]{0,61}[a-zA-Z0-9])?)*>`)
  74. openTagRegexp = regexp.MustCompile(`^` + openTag)
  75. closingTagRegexp = regexp.MustCompile(`^` + closingTag)
  76. )
  77. const (
  78. // https://spec.commonmark.org/0.30/#open-tag
  79. openTag = `<` +
  80. `[a-zA-Z][a-zA-Z0-9-]*` + // tag name
  81. (`(?:` +
  82. `[ \t\n]+` + // whitespace
  83. `[a-zA-Z_:][a-zA-Z0-9_\.:-]*` + // attribute name
  84. `(?:[ \t\n]*=[ \t\n]*(?:[^ \t\n"'=<>` + "`" + `]+|'[^']*'|"[^"]*"))?` + // attribute value specification
  85. `)*`) + // zero or more attributes
  86. `[ \t\n]*` + // whitespace
  87. `/?>`
  88. // https://spec.commonmark.org/0.30/#closing-tag
  89. closingTag = `</[a-zA-Z][a-zA-Z0-9-]*[ \t\n]*>`
  90. )
  91. func (p *inlineParser) render() {
  92. for p.pos < len(p.text) {
  93. b := p.text[p.pos]
  94. begin := p.pos
  95. p.pos++
  96. parseText := func() {
  97. for p.pos < len(p.text) && !isMeta(p.text[p.pos]) {
  98. p.pos++
  99. }
  100. text := p.text[begin:p.pos]
  101. hardLineBreak := false
  102. if p.pos < len(p.text) && p.text[p.pos] == '\n' {
  103. // https://spec.commonmark.org/0.30/#hard-line-break
  104. //
  105. // The input to renderInline never ends in a newline, so all
  106. // newlines are internal ones, thus subject to the hard line
  107. // break rules
  108. hardLineBreak = strings.HasSuffix(text, " ")
  109. text = strings.TrimRight(text, " ")
  110. }
  111. p.buf.push(textPiece(text))
  112. if hardLineBreak {
  113. p.buf.push(piece{main: InlineOp{Type: OpHardLineBreak}})
  114. }
  115. }
  116. switch b {
  117. // The 3 branches below implement the first part of
  118. // https://spec.commonmark.org/0.30/#an-algorithm-for-parsing-nested-emphasis-and-links.
  119. case '[':
  120. bufIdx := p.buf.push(textPiece("["))
  121. p.delims.push(&delim{typ: '[', bufIdx: bufIdx})
  122. case '!':
  123. if p.pos < len(p.text) && p.text[p.pos] == '[' {
  124. p.pos++
  125. bufIdx := p.buf.push(textPiece("!["))
  126. p.delims.push(&delim{typ: '!', bufIdx: bufIdx})
  127. } else {
  128. parseText()
  129. }
  130. case '*', '_':
  131. p.consumeRun(b)
  132. canOpen, canClose := canOpenCloseEmphasis(rune(b),
  133. emptyToNewline(utf8.DecodeLastRuneInString(p.text[:begin])),
  134. emptyToNewline(utf8.DecodeRuneInString(p.text[p.pos:])))
  135. bufIdx := p.buf.push(textPiece(p.text[begin:p.pos]))
  136. p.delims.push(
  137. &delim{typ: b, bufIdx: bufIdx,
  138. n: p.pos - begin, canOpen: canOpen, canClose: canClose})
  139. case ']':
  140. // https://spec.commonmark.org/0.30/#look-for-link-or-image.
  141. var opener *delim
  142. for d := p.delims.top.prev; d != p.delims.bottom; d = d.prev {
  143. if d.typ == '[' || d.typ == '!' {
  144. opener = d
  145. break
  146. }
  147. }
  148. if opener == nil || opener.inactive {
  149. if opener != nil {
  150. unlink(opener)
  151. }
  152. p.buf.push(textPiece("]"))
  153. continue
  154. }
  155. n, dest, title := parseLinkTail(p.text[p.pos:])
  156. if n == -1 {
  157. unlink(opener)
  158. p.buf.push(textPiece("]"))
  159. continue
  160. }
  161. p.pos += n
  162. p.processEmphasis(opener)
  163. if opener.typ == '[' {
  164. for d := opener.prev; d != p.delims.bottom; d = d.prev {
  165. if d.typ == '[' {
  166. d.inactive = true
  167. }
  168. }
  169. }
  170. unlink(opener)
  171. if opener.typ == '[' {
  172. p.buf.pieces[opener.bufIdx] = piece{
  173. before: []InlineOp{{Type: OpLinkStart, Dest: dest, Text: title}}}
  174. p.buf.push(piece{
  175. after: []InlineOp{{Type: OpLinkEnd, Dest: dest, Text: title}}})
  176. } else {
  177. // Use the pieces after "![" to build the image alt text.
  178. var altBuilder strings.Builder
  179. for _, piece := range p.buf.pieces[opener.bufIdx+1:] {
  180. altBuilder.WriteString(piece.main.String())
  181. }
  182. p.buf.pieces = p.buf.pieces[:opener.bufIdx]
  183. alt := altBuilder.String()
  184. p.buf.push(piece{
  185. main: InlineOp{Type: OpImage, Dest: dest, Alt: alt, Text: title}})
  186. }
  187. case '`':
  188. // https://spec.commonmark.org/0.30/#code-spans
  189. p.consumeRun('`')
  190. closer := findBacktickRun(p.text, p.text[begin:p.pos], p.pos)
  191. if closer == -1 {
  192. // No matching closer, don't parse as code span.
  193. parseText()
  194. continue
  195. }
  196. p.buf.push(piece{
  197. main: InlineOp{Type: OpCodeSpan,
  198. Text: normalizeCodeSpanContent(p.text[p.pos:closer])}})
  199. p.pos = closer + (p.pos - begin)
  200. case '<':
  201. // https://spec.commonmark.org/0.30/#raw-html
  202. if p.pos == len(p.text) {
  203. parseText()
  204. continue
  205. }
  206. parseWithRegexp := func(pattern *regexp.Regexp) bool {
  207. html := pattern.FindString(p.text[begin:])
  208. if html == "" {
  209. return false
  210. }
  211. p.buf.push(htmlPiece(html))
  212. p.pos = begin + len(html)
  213. return true
  214. }
  215. parseWithCloser := func(closer string) bool {
  216. i := strings.Index(p.text[p.pos:], closer)
  217. if i == -1 {
  218. return false
  219. }
  220. p.pos += i + len(closer)
  221. p.buf.push(htmlPiece(p.text[begin:p.pos]))
  222. return true
  223. }
  224. switch p.text[p.pos] {
  225. case '!':
  226. switch {
  227. case strings.HasPrefix(p.text[p.pos:], "!--"):
  228. // Try parsing a comment.
  229. if parseWithCloser("-->") {
  230. continue
  231. }
  232. case strings.HasPrefix(p.text[p.pos:], "![CDATA["):
  233. // Try parsing a CDATA section
  234. if parseWithCloser("]]>") {
  235. continue
  236. }
  237. case p.pos+1 < len(p.text) && isASCIILetter(p.text[p.pos+1]):
  238. // Try parsing a declaration.
  239. if parseWithCloser(">") {
  240. continue
  241. }
  242. }
  243. case '?':
  244. // Try parsing a processing instruction.
  245. closer := strings.Index(p.text[p.pos:], "?>")
  246. if closer != -1 {
  247. p.buf.push(htmlPiece(p.text[begin : p.pos+closer+2]))
  248. p.pos += closer + 2
  249. continue
  250. }
  251. case '/':
  252. // Try parsing a closing tag.
  253. if parseWithRegexp(closingTagRegexp) {
  254. continue
  255. }
  256. default:
  257. // Try parsing a open tag.
  258. if parseWithRegexp(openTagRegexp) {
  259. continue
  260. } else {
  261. // Try parsing an autolink.
  262. autolink := uriAutolinkRegexp.FindString(p.text[begin:])
  263. email := false
  264. if autolink == "" {
  265. autolink = emailAutolinkRegexp.FindString(p.text[begin:])
  266. email = true
  267. }
  268. if autolink != "" {
  269. p.pos = begin + len(autolink)
  270. // Autolinks support character references but not
  271. // backslashes, so UnescapeHTML gives us the desired
  272. // behavior.
  273. text := UnescapeHTML(autolink[1 : len(autolink)-1])
  274. dest := text
  275. if email {
  276. dest = "mailto:" + dest
  277. }
  278. p.buf.push(piece{
  279. main: InlineOp{Type: OpAutolink, Text: text, Dest: dest},
  280. })
  281. continue
  282. }
  283. }
  284. }
  285. parseText()
  286. case '&':
  287. // https://spec.commonmark.org/0.30/#entity-and-numeric-character-references
  288. if entity := leadingCharRef(p.text[begin:]); entity != "" {
  289. p.buf.push(textPiece(UnescapeHTML(entity)))
  290. p.pos = begin + len(entity)
  291. } else {
  292. parseText()
  293. }
  294. case '\\':
  295. // https://spec.commonmark.org/0.30/#backslash-escapes
  296. if p.pos < len(p.text) {
  297. if p.text[p.pos] == '\n' {
  298. // https://spec.commonmark.org/0.30/#hard-line-break
  299. //
  300. // Do *not* consume the newline; "\\\n" is a hard line break
  301. // plus a (soft) line break.
  302. p.buf.push(piece{main: InlineOp{Type: OpHardLineBreak}})
  303. continue
  304. } else if isASCIIPunct(p.text[p.pos]) {
  305. // Valid backslash escape: handle this by just discarding
  306. // the backslash. The parseText call below will consider the
  307. // next byte to be already included in the text content.
  308. begin++
  309. p.pos++
  310. }
  311. }
  312. parseText()
  313. case '\n':
  314. // Hard line breaks are already inserted using lookahead in
  315. // parseText and the case '\\' branch.
  316. p.buf.push(piece{main: InlineOp{Type: OpNewLine}})
  317. // Remove spaces at the beginning of the next line per
  318. // https://spec.commonmark.org/0.30/#soft-line-breaks.
  319. for p.pos < len(p.text) && p.text[p.pos] == ' ' {
  320. p.pos++
  321. }
  322. default:
  323. parseText()
  324. }
  325. }
  326. p.processEmphasis(p.delims.bottom)
  327. }
  328. func (p *inlineParser) consumeRun(b byte) {
  329. for p.pos < len(p.text) && p.text[p.pos] == b {
  330. p.pos++
  331. }
  332. }
  333. // Processes the (rune, int) result of utf8.Decode* so that an empty result is
  334. // converted to '\n'.
  335. func emptyToNewline(r rune, l int) rune {
  336. if l == 0 {
  337. return '\n'
  338. }
  339. return r
  340. }
  341. // Returns whether an emphasis punctuation can open or close an emphasis, when
  342. // following prev and preceding next. Start and end of file should be
  343. // represented by '\n'.
  344. //
  345. // The criteria are described in:
  346. // https://spec.commonmark.org/0.30/#emphasis-and-strong-emphasis
  347. //
  348. // The algorithm is a bit complicated. Here is another way to describe the
  349. // criteria:
  350. //
  351. // - Every rune falls into one of three categories: space, punctuation and
  352. // other. "Other" is the category of word runes in "intraword emphasis".
  353. //
  354. // - The following tables describe whether a punctuation can open or close
  355. // emphasis:
  356. //
  357. // Can open emphasis:
  358. //
  359. // | | next space | next punct | next other |
  360. // | ---------- | ---------- | ---------- | ---------- |
  361. // | prev space | | _ or * | _ or * |
  362. // | prev punct | | _ or * | _ or * |
  363. // | prev other | | | only * |
  364. //
  365. // Can close emphasis:
  366. //
  367. // | | next space | next punct | next other |
  368. // | ---------- | ---------- | ---------- | ---------- |
  369. // | prev space | | | |
  370. // | prev punct | _ or * | _ or * | |
  371. // | prev other | _ or * | _ or * | only * |
  372. func canOpenCloseEmphasis(b, prev, next rune) (bool, bool) {
  373. leftFlanking := !unicode.IsSpace(next) &&
  374. (!isUnicodePunct(next) || unicode.IsSpace(prev) || isUnicodePunct(prev))
  375. rightFlanking := !unicode.IsSpace(prev) &&
  376. (!isUnicodePunct(prev) || unicode.IsSpace(next) || isUnicodePunct(next))
  377. if b == '*' {
  378. return leftFlanking, rightFlanking
  379. }
  380. return leftFlanking && (!rightFlanking || isUnicodePunct(prev)),
  381. rightFlanking && (!leftFlanking || isUnicodePunct(next))
  382. }
  383. // Returns the starting index of the next backtick run identical to the given
  384. // run, starting from i. Returns -1 if no such run exists.
  385. func findBacktickRun(s, run string, i int) int {
  386. for i < len(s) {
  387. j := strings.Index(s[i:], run)
  388. if j == -1 {
  389. return -1
  390. }
  391. j += i
  392. if j+len(run) == len(s) || s[j+len(run)] != '`' {
  393. return j
  394. }
  395. // Too many backticks; skip over the entire run.
  396. for j += len(run); j < len(s) && s[j] == '`'; j++ {
  397. }
  398. i = j
  399. }
  400. return -1
  401. }
  402. func normalizeCodeSpanContent(s string) string {
  403. s = strings.ReplaceAll(s, "\n", " ")
  404. if len(s) > 1 && s[0] == ' ' && s[len(s)-1] == ' ' && strings.Trim(s, " ") != "" {
  405. return s[1 : len(s)-1]
  406. }
  407. return s
  408. }
  409. // https://spec.commonmark.org/0.30/#process-emphasis
  410. func (p *inlineParser) processEmphasis(bottom *delim) {
  411. var openersBottom [2][3][2]*delim
  412. for closer := bottom.next; closer != nil; {
  413. if !closer.canClose {
  414. closer = closer.next
  415. continue
  416. }
  417. openerBottom := &openersBottom[b2i(closer.typ == '_')][closer.n%3][b2i(closer.canOpen)]
  418. if *openerBottom == nil {
  419. *openerBottom = bottom
  420. }
  421. var opener *delim
  422. for p := closer.prev; p != *openerBottom && p != bottom; p = p.prev {
  423. if p.canOpen && p.typ == closer.typ &&
  424. ((!p.canClose && !closer.canOpen) ||
  425. (p.n+closer.n)%3 != 0 || (p.n%3 == 0 && closer.n%3 == 0)) {
  426. opener = p
  427. break
  428. }
  429. }
  430. if opener == nil {
  431. *openerBottom = closer.prev
  432. if !closer.canOpen {
  433. closer.prev.next = closer.next
  434. closer.next.prev = closer.prev
  435. }
  436. closer = closer.next
  437. continue
  438. }
  439. openerPiece := &p.buf.pieces[opener.bufIdx]
  440. closerPiece := &p.buf.pieces[closer.bufIdx]
  441. strong := len(openerPiece.main.Text) >= 2 && len(closerPiece.main.Text) >= 2
  442. if strong {
  443. openerPiece.main.Text = openerPiece.main.Text[2:]
  444. openerPiece.append(InlineOp{Type: OpStrongEmphasisStart})
  445. closerPiece.main.Text = closerPiece.main.Text[2:]
  446. closerPiece.prepend(InlineOp{Type: OpStrongEmphasisEnd})
  447. } else {
  448. openerPiece.main.Text = openerPiece.main.Text[1:]
  449. openerPiece.append(InlineOp{Type: OpEmphasisStart})
  450. closerPiece.main.Text = closerPiece.main.Text[1:]
  451. closerPiece.prepend(InlineOp{Type: OpEmphasisEnd})
  452. }
  453. opener.next = closer
  454. closer.prev = opener
  455. if openerPiece.main.Text == "" {
  456. opener.prev.next = opener.next
  457. opener.next.prev = opener.prev
  458. }
  459. if closerPiece.main.Text == "" {
  460. closer.prev.next = closer.next
  461. closer.next.prev = closer.prev
  462. closer = closer.next
  463. }
  464. }
  465. bottom.next = p.delims.top
  466. p.delims.top.prev = bottom
  467. }
  468. func b2i(b bool) int {
  469. if b {
  470. return 1
  471. } else {
  472. return 0
  473. }
  474. }
  475. // Stores output of inline rendering.
  476. type buffer struct {
  477. pieces []piece
  478. }
  479. func (b *buffer) push(p piece) int {
  480. b.pieces = append(b.pieces, p)
  481. return len(b.pieces) - 1
  482. }
  483. func (b *buffer) ops() []InlineOp {
  484. var ops []InlineOp
  485. for _, p := range b.pieces {
  486. p.iterate(func(op InlineOp) {
  487. if op.Type == OpText {
  488. // Convert any embedded newlines into OpNewLine, and merge
  489. // adjacent OpText's or OpRawHTML's.
  490. if op.Text == "" {
  491. return
  492. }
  493. lines := strings.Split(op.Text, "\n")
  494. if len(ops) > 0 && ops[len(ops)-1].Type == op.Type {
  495. ops[len(ops)-1].Text += lines[0]
  496. } else if lines[0] != "" {
  497. ops = append(ops, InlineOp{Type: op.Type, Text: lines[0]})
  498. }
  499. for _, line := range lines[1:] {
  500. ops = append(ops, InlineOp{Type: OpNewLine})
  501. if line != "" {
  502. ops = append(ops, InlineOp{Type: op.Type, Text: line})
  503. }
  504. }
  505. } else {
  506. ops = append(ops, op)
  507. }
  508. })
  509. }
  510. return ops
  511. }
  512. // The algorithm described in
  513. // https://spec.commonmark.org/0.30/#phase-2-inline-structure involves inserting
  514. // nodes before and after existing nodes in the output. The most natural choice
  515. // is a doubly linked list; but for simplicity, we use a slice for output nodes,
  516. // keep track of nodes that need to be prepended or appended to each node.
  517. //
  518. // TODO: Compare the performance of this data structure with doubly linked
  519. // lists.
  520. type piece struct {
  521. before []InlineOp
  522. main InlineOp
  523. after []InlineOp
  524. }
  525. func textPiece(text string) piece {
  526. return piece{main: InlineOp{Type: OpText, Text: text}}
  527. }
  528. func htmlPiece(html string) piece {
  529. return piece{main: InlineOp{Type: OpRawHTML, Text: html}}
  530. }
  531. func (p *piece) prepend(op InlineOp) { p.before = append(p.before, op) }
  532. func (p *piece) append(op InlineOp) { p.after = append(p.after, op) }
  533. func (p *piece) iterate(f func(InlineOp)) {
  534. for _, op := range p.before {
  535. f(op)
  536. }
  537. f(p.main)
  538. for i := len(p.after) - 1; i >= 0; i-- {
  539. f(p.after[i])
  540. }
  541. }
  542. // A delimiter "stack" (actually a doubly linked list), with sentinels as bottom
  543. // and top, with the bottom being the head of the list.
  544. //
  545. // https://spec.commonmark.org/0.30/#delimiter-stack
  546. type delimStack struct {
  547. bottom, top *delim
  548. }
  549. func makeDelimStack() delimStack {
  550. bottom := &delim{}
  551. top := &delim{prev: bottom}
  552. bottom.next = top
  553. return delimStack{bottom, top}
  554. }
  555. func (s *delimStack) push(n *delim) {
  556. n.prev = s.top.prev
  557. n.next = s.top
  558. s.top.prev.next = n
  559. s.top.prev = n
  560. }
  561. // A node in the delimiter "stack".
  562. type delim struct {
  563. typ byte
  564. bufIdx int
  565. prev *delim
  566. next *delim
  567. // Only used when typ is '['
  568. inactive bool
  569. // Only used when typ is '_' or '*'.
  570. n int
  571. canOpen bool
  572. canClose bool
  573. }
  574. func unlink(n *delim) {
  575. n.next.prev = n.prev
  576. n.prev.next = n.next
  577. }
  578. type linkTailParser struct {
  579. text string
  580. pos int
  581. }
  582. // Parses the link "tail", the part after the ] that closes the link text.
  583. func parseLinkTail(text string) (n int, dest, title string) {
  584. p := linkTailParser{text, 0}
  585. return p.parse()
  586. }
  587. // https://spec.commonmark.org/0.30/#links
  588. func (p *linkTailParser) parse() (n int, dest, title string) {
  589. if len(p.text) < 2 || p.text[0] != '(' {
  590. return -1, "", ""
  591. }
  592. p.pos = 1
  593. p.skipWhitespaces()
  594. if p.pos == len(p.text) {
  595. return -1, "", ""
  596. }
  597. // Parse an optional link destination.
  598. var destBuilder strings.Builder
  599. if p.text[p.pos] == '<' {
  600. p.pos++
  601. closed := false
  602. angleDest:
  603. for p.pos < len(p.text) {
  604. switch p.text[p.pos] {
  605. case '>':
  606. p.pos++
  607. closed = true
  608. break angleDest
  609. case '\n', '<':
  610. return -1, "", ""
  611. case '\\':
  612. destBuilder.WriteByte(p.parseBackslash())
  613. case '&':
  614. destBuilder.WriteString(p.parseCharRef())
  615. default:
  616. destBuilder.WriteByte(p.text[p.pos])
  617. p.pos++
  618. }
  619. }
  620. if !closed {
  621. return -1, "", ""
  622. }
  623. } else {
  624. parenBalance := 0
  625. bareDest:
  626. for p.pos < len(p.text) {
  627. if isASCIIControl(p.text[p.pos]) || p.text[p.pos] == ' ' {
  628. break
  629. }
  630. switch p.text[p.pos] {
  631. case '(':
  632. parenBalance++
  633. destBuilder.WriteByte('(')
  634. p.pos++
  635. case ')':
  636. if parenBalance == 0 {
  637. break bareDest
  638. }
  639. parenBalance--
  640. destBuilder.WriteByte(')')
  641. p.pos++
  642. case '\\':
  643. destBuilder.WriteByte(p.parseBackslash())
  644. case '&':
  645. destBuilder.WriteString(p.parseCharRef())
  646. default:
  647. destBuilder.WriteByte(p.text[p.pos])
  648. p.pos++
  649. }
  650. }
  651. if parenBalance != 0 {
  652. return -1, "", ""
  653. }
  654. }
  655. p.skipWhitespaces()
  656. var titleBuilder strings.Builder
  657. if p.pos < len(p.text) && strings.ContainsRune("'\"(", rune(p.text[p.pos])) {
  658. opener := p.text[p.pos]
  659. closer := p.text[p.pos]
  660. if closer == '(' {
  661. closer = ')'
  662. }
  663. p.pos++
  664. title:
  665. for p.pos < len(p.text) {
  666. switch p.text[p.pos] {
  667. case closer:
  668. p.pos++
  669. break title
  670. case opener:
  671. // Titles started with "(" does not allow unescaped "(":
  672. // https://spec.commonmark.org/0.30/#link-title
  673. return -1, "", ""
  674. case '\\':
  675. titleBuilder.WriteByte(p.parseBackslash())
  676. case '&':
  677. titleBuilder.WriteString(p.parseCharRef())
  678. default:
  679. titleBuilder.WriteByte(p.text[p.pos])
  680. p.pos++
  681. }
  682. }
  683. }
  684. p.skipWhitespaces()
  685. if p.pos == len(p.text) || p.text[p.pos] != ')' {
  686. return -1, "", ""
  687. }
  688. return p.pos + 1, destBuilder.String(), titleBuilder.String()
  689. }
  690. func (p *linkTailParser) skipWhitespaces() {
  691. for p.pos < len(p.text) && isWhitespace(p.text[p.pos]) {
  692. p.pos++
  693. }
  694. }
  695. func isWhitespace(b byte) bool { return b == ' ' || b == '\t' || b == '\n' }
  696. func (p *linkTailParser) parseBackslash() byte {
  697. if p.pos+1 < len(p.text) && isASCIIPunct(p.text[p.pos+1]) {
  698. b := p.text[p.pos+1]
  699. p.pos += 2
  700. return b
  701. }
  702. p.pos++
  703. return '\\'
  704. }
  705. func (p *linkTailParser) parseCharRef() string {
  706. if entity := leadingCharRef(p.text[p.pos:]); entity != "" {
  707. p.pos += len(entity)
  708. return UnescapeHTML(entity)
  709. }
  710. p.pos++
  711. return p.text[p.pos-1 : p.pos]
  712. }
  713. func isASCIILetter(b byte) bool { return ('a' <= b && b <= 'z') || ('A' <= b && b <= 'Z') }
  714. func isASCIIControl(b byte) bool { return b < 0x20 }
  715. const asciiPuncts = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"
  716. func isASCIIPunct(b byte) bool { return strings.IndexByte(asciiPuncts, b) >= 0 }
  717. // The CommonMark spec has its own definition of Unicode punctuation:
  718. // https://spec.commonmark.org/0.30/#unicode-punctuation-character
  719. //
  720. // This definition includes all the ASCII punctuations above, some of which
  721. // ("$+<=>^`|~" to be exact) are not considered to be punctuations by
  722. // unicode.IsPunct.
  723. func isUnicodePunct(r rune) bool {
  724. return unicode.IsPunct(r) || r <= 0x7f && isASCIIPunct(byte(r))
  725. }
  726. const metas = "![]*_`\\&<\n"
  727. func isMeta(b byte) bool { return strings.IndexByte(metas, b) >= 0 }