# Source code for commonmark.blocks

from __future__ import absolute_import, unicode_literals

import re
from commonmark import common
from commonmark.common import unescape_string
from commonmark.inlines import InlineParser
from commonmark.node import Node


# Number of columns of indentation at which a line counts as "indented"
# (i.e. can start or continue an indented code block).
CODE_INDENT = 4
# Start conditions for HTML blocks.  The list index is the HTML block type
# (1-7); index 0 is a placeholder so that indices line up with the types.
reHtmlBlockOpen = [
    re.compile(r'.'),  # dummy for 0
    re.compile(r'^<(?:script|pre|style)(?:\s|>|$)', re.IGNORECASE),
    re.compile(r'^<!--'),
    re.compile(r'^<[?]'),
    re.compile(r'^<![A-Z]'),
    re.compile(r'^<!\[CDATA\['),
    # Type 6: block-level tag names.  All six heading levels (h1-h6) are
    # block-level tags, not only h1.
    re.compile(
        r'^<[/]?(?:address|article|aside|base|basefont|blockquote|body|'
        r'caption|center|col|colgroup|dd|details|dialog|dir|div|dl|dt|'
        r'fieldset|figcaption|figure|footer|form|frame|frameset|h[1-6]|head|'
        r'header|hr|html|iframe|legend|li|link|main|menu|menuitem|'
        r'nav|noframes|ol|optgroup|option|p|param|section|source|'
        r'summary|table|tbody|td|tfoot|th|thead|title|tr|track|ul)'
        r'(?:\s|[/]?[>]|$)',
        re.IGNORECASE),
    # Type 7: a line holding nothing but a single open or closing tag.
    re.compile(
        '^(?:' + common.OPENTAG + '|' + common.CLOSETAG + ')\\s*$',
        re.IGNORECASE),
]
# End conditions for HTML blocks of types 1-5, indexed by block type
# (index 0 is a placeholder).  Types 6 and 7 have no entry here; they are
# ended by a blank line in HtmlBlock.continue_.
reHtmlBlockClose = [
    re.compile(r'.'),  # dummy for 0
    re.compile(r'<\/(?:script|pre|style)>', re.IGNORECASE),
    re.compile(r'-->'),
    re.compile(r'\?>'),
    re.compile(r'>'),
    re.compile(r'\]\]>'),
]
# A line made only of three or more '*', '_' or '-' characters, each
# optionally followed by spaces or tabs.
reThematicBreak = re.compile(
    r'^(?:(?:\*[ \t]*){3,}|(?:_[ \t]*){3,}|(?:-[ \t]*){3,})[ \t]*$')
# Characters that may begin a block construct; used as a cheap pre-filter
# before the block-start functions are tried on a line.
reMaybeSpecial = re.compile(r'^[#`~*+_=<>0-9-]')
# Any character other than space, tab, form feed, vertical tab, CR or LF.
reNonSpace = re.compile(r'[^ \t\f\v\r\n]')
# Bullet list marker character.
reBulletListMarker = re.compile(r'^[*+-]')
# Ordered list marker: 1-9 digits (group 1) and a '.' or ')' (group 2).
reOrderedListMarker = re.compile(r'^(\d{1,9})([.)])')
# One to six '#' followed by spaces/tabs or the end of the string.
reATXHeadingMarker = re.compile(r'^#{1,6}(?:[ \t]+|$)')
# Opening code fence: 3+ backticks with no further backtick later in the
# string, or 3+ tildes.
reCodeFence = re.compile(r'^`{3,}(?!.*`)|^~{3,}')
# Closing code fence: 3+ backticks or tildes followed only by spaces.
reClosingCodeFence = re.compile(r'^(?:`{3,}|~{3,})(?= *$)')
# Setext heading underline: a run of '=' or of '-', then optional
# spaces/tabs.
reSetextHeadingLine = re.compile(r'^(?:=+|-+)[ \t]*$')
# Any of the three line-ending conventions: CRLF, LF or CR.
reLineEnding = re.compile(r'\r\n|\n|\r')


def is_blank(s):
    """Return True when ``s`` contains nothing matched by ``reNonSpace``.

    In other words, the string is empty or consists solely of spaces,
    tabs, form feeds, vertical tabs, carriage returns and newlines.
    """
    return reNonSpace.search(s) is None


def is_space_or_tab(s):
    """Return True if ``s`` is exactly one space or one tab character.

    Any other value (including ``None`` and the empty string) gives False.
    """
    return s == ' ' or s == '\t'


def peek(ln, pos):
    """Return the character of ``ln`` at index ``pos``.

    Returns None instead of raising when ``pos`` is at or beyond the end
    of the string.
    """
    return ln[pos] if pos < len(ln) else None


def ends_with_blank_line(block):
    """Return True if ``block`` ends with a blank line.

    Starting at ``block``, follow the chain of last children through
    'list' and 'item' nodes.  The answer is True as soon as a node with
    ``last_line_blank`` set is found.  Every node inspected gets its
    ``last_line_checked`` flag set, and a list or item whose flag was
    already set is not descended into again.
    """
    node = block
    while node:
        if node.last_line_blank:
            return True
        # Decide whether to descend before marking the node as checked.
        descend = not node.last_line_checked and node.t in ('list', 'item')
        node.last_line_checked = True
        if not descend:
            return False
        node = node.last_child

    return False


def parse_list_marker(parser, container):
    """ Parse a list marker and return data on the marker (type,
    start, delimiter, bullet character, padding) or None.

    The marker is looked for at ``parser.next_nonspace`` in
    ``parser.current_line``.  On success the parser's offset/column are
    advanced past the marker and the whitespace following it, and a dict
    with the keys 'type', 'tight', 'bullet_char', 'start', 'delimiter',
    'padding' and 'marker_offset' is returned.  On failure None is returned
    and the parser position is left untouched.

    ``container`` is the block the line is being matched against; when it
    is a paragraph, extra restrictions apply (see comments below)."""
    rest = parser.current_line[parser.next_nonspace:]
    data = {
        'type': None,
        'tight': True,  # lists are tight by default
        'bullet_char': None,
        'start': None,
        'delimiter': None,
        'padding': None,
        'marker_offset': parser.indent,
    }
    # a marker indented four or more columns is not a list marker
    if parser.indent >= 4:
        return None
    m = re.search(reBulletListMarker, rest)
    m2 = re.search(reOrderedListMarker, rest)
    if m:
        data['type'] = 'bullet'
        data['bullet_char'] = m.group()[0]
    # an ordered marker is only accepted inside a paragraph container
    # when its number is exactly '1'
    elif m2 and (container.t != 'paragraph' or m2.group(1) == '1'):
        m = m2
        data['type'] = 'ordered'
        data['start'] = int(m.group(1))
        data['delimiter'] = m.group(2)
    else:
        return None

    # make sure we have spaces after
    nextc = peek(parser.current_line, parser.next_nonspace + len(m.group()))
    if not (nextc is None or nextc == '\t' or nextc == ' '):
        return None

    # if it interrupts paragraph, make sure first line isn't blank
    if container.t == 'paragraph' and \
       not re.search(
           reNonSpace,
           parser.current_line[parser.next_nonspace + len(m.group()):]):
        return None

    # we've got a match! advance offset and calculate padding
    parser.advance_next_nonspace()  # to start of marker
    parser.advance_offset(len(m.group()), True)  # to end of marker
    # remember where the whitespace after the marker begins so the
    # position can be restored below
    spaces_start_col = parser.column
    spaces_start_offset = parser.offset
    # consume spaces/tabs after the marker one column at a time, stopping
    # once five columns have been consumed or a non-space is reached
    while True:
        parser.advance_offset(1, True)
        nextc = peek(parser.current_line, parser.offset)
        if parser.column - spaces_start_col < 5 and \
           is_space_or_tab(nextc):
            pass
        else:
            break
    # the item is blank if the end of the line was reached
    blank_item = peek(parser.current_line, parser.offset) is None
    spaces_after_marker = parser.column - spaces_start_col
    if spaces_after_marker >= 5 or \
       spaces_after_marker < 1 or \
       blank_item:
        # too many spaces, none at all, or nothing after the marker:
        # padding is the marker width plus one, and the position is rewound
        # to just after the marker (plus at most one space or tab)
        data['padding'] = len(m.group()) + 1
        parser.column = spaces_start_col
        parser.offset = spaces_start_offset
        if is_space_or_tab(peek(parser.current_line, parser.offset)):
            parser.advance_offset(1, True)
    else:
        data['padding'] = len(m.group()) + spaces_after_marker

    return data


def lists_match(list_data, item_data):
    """
    Tell whether a list and a list item are of the same kind.

    Both arguments are list-data dicts; they match when their 'type',
    'delimiter' and 'bullet_char' entries are all equal (a missing key
    counts as None).  Used to decide whether a new item continues the
    current list.
    """
    return all(
        list_data.get(key) == item_data.get(key)
        for key in ('type', 'delimiter', 'bullet_char'))


class Block(object):
    """Base class for the per-node-type block handlers.

    Subclasses override ``accepts_lines`` and the three static hooks
    below; the defaults here do nothing and return None.
    """
    # Whether blocks of this type take raw text lines (None on the base).
    accepts_lines = None

    @staticmethod
    def continue_(parser=None, container=None):
        """Hook: check whether the current line continues ``container``."""
        return None

    @staticmethod
    def finalize(parser=None, block=None):
        """Hook: post-process ``block`` when it is closed."""
        return None

    @staticmethod
    def can_contain(t):
        """Hook: tell whether a child of node type ``t`` is allowed."""
        return None


class Document(Block):
    """Handler for the 'document' node."""
    accepts_lines = False

    @staticmethod
    def continue_(parser=None, container=None):
        """The document matches every line: always return 0."""
        return 0

    @staticmethod
    def finalize(parser=None, block=None):
        """Nothing to do when the document is closed."""
        return None

    @staticmethod
    def can_contain(t):
        """Any node type except 'item' may be a child of the document."""
        return t != 'item'


class List(Block):
    """Handler for 'list' nodes."""
    accepts_lines = False

    @staticmethod
    def continue_(parser=None, container=None):
        """A list matches every line: always return 0."""
        return 0

    @staticmethod
    def finalize(parser=None, block=None):
        """Work out whether the list is loose and record it.

        ``block.list_data['tight']`` is set to False when a non-final item
        ends with a blank line, or when any child of an item ends with a
        blank line and is followed by something (a sibling, or a later
        item).  Otherwise the entry is left as it was.
        """
        entry = block.first_child
        while entry:
            # a blank line at the end of an item that is not the last one
            if ends_with_blank_line(entry) and entry.nxt:
                block.list_data['tight'] = False
                break
            # look inside the item for blank lines between its children
            child = entry.first_child
            while child:
                blank_after = ends_with_blank_line(child)
                if blank_after and (entry.nxt or child.nxt):
                    block.list_data['tight'] = False
                    break
                child = child.nxt
            entry = entry.nxt

    @staticmethod
    def can_contain(t):
        """A list may only contain 'item' nodes."""
        return t == 'item'


class BlockQuote(Block):
    """Handler for 'block_quote' nodes."""
    accepts_lines = False

    @staticmethod
    def continue_(parser=None, container=None):
        """Continue the quote if the line starts with a '>' marker.

        Returns 1 (no match) when the line is indented or its first
        non-space character is not '>'.  Otherwise the marker and at most
        one following space or tab are consumed and 0 is returned.
        """
        line = parser.current_line
        if parser.indented or peek(line, parser.next_nonspace) != '>':
            return 1
        parser.advance_next_nonspace()
        parser.advance_offset(1, False)
        if is_space_or_tab(peek(line, parser.offset)):
            parser.advance_offset(1, True)
        return 0

    @staticmethod
    def finalize(parser=None, block=None):
        """Nothing to do when a block quote is closed."""
        return None

    @staticmethod
    def can_contain(t):
        """Any node type except 'item' may be a child of a block quote."""
        return t != 'item'


class Item(Block):
    """Handler for list 'item' nodes."""
    accepts_lines = False

    @staticmethod
    def continue_(parser=None, container=None):
        """Check whether the current line belongs to the list item.

        A blank line continues the item unless the item has no children
        yet.  A non-blank line continues it only when indented by at least
        the item's marker offset plus padding; that much indentation is
        then consumed.  Returns 0 on a match and 1 otherwise.
        """
        if parser.blank:
            if container.first_child is None:
                # Blank line after empty list item
                return 1
            parser.advance_next_nonspace()
            return 0

        item_data = container.list_data
        required = item_data['marker_offset'] + item_data['padding']
        if parser.indent < required:
            return 1
        parser.advance_offset(required, True)
        return 0

    @staticmethod
    def finalize(parser=None, block=None):
        """Nothing to do when a list item is closed."""
        return None

    @staticmethod
    def can_contain(t):
        """Any node type except 'item' may be a child of a list item."""
        return t != 'item'


class Heading(Block):
    """Handler for 'heading' nodes."""
    accepts_lines = False

    @staticmethod
    def continue_(parser=None, container=None):
        """Always return 1: a heading never spans more than one line."""
        return 1

    @staticmethod
    def finalize(parser=None, block=None):
        """Nothing to do when a heading is closed."""
        return None

    @staticmethod
    def can_contain(t):
        """A heading has no block children."""
        return False


class ThematicBreak(Block):
    """Handler for 'thematic_break' nodes."""
    accepts_lines = False

    @staticmethod
    def continue_(parser=None, container=None):
        """Always return 1: a thematic break never spans more than one
        line."""
        return 1

    @staticmethod
    def finalize(parser=None, block=None):
        """Nothing to do when a thematic break is closed."""
        return None

    @staticmethod
    def can_contain(t):
        """A thematic break has no block children."""
        return False


class CodeBlock(Block):
    """Handler for 'code_block' nodes, both fenced and indented."""
    accepts_lines = True

    @staticmethod
    def continue_(parser=None, container=None):
        """Check whether the current line continues the code block.

        Indented block: returns 0 after consuming CODE_INDENT columns of
        an indented line (or the leading whitespace of a blank line), and
        1 for any other line.

        Fenced block: returns 2 after finalizing the block when the line
        is a closing fence of the same character and at least the opening
        fence's length; otherwise skips up to ``fence_offset`` leading
        spaces/tabs and returns 0.
        """
        line = parser.current_line
        indent = parser.indent

        if not container.is_fenced:
            # indented
            if indent >= CODE_INDENT:
                parser.advance_offset(CODE_INDENT, True)
                return 0
            if parser.blank:
                parser.advance_next_nonspace()
                return 0
            return 1

        start = parser.next_nonspace
        closing = None
        if indent <= 3 and start < len(line) and \
                line[start] == container.fence_char:
            closing = reClosingCodeFence.search(line[start:])
        if closing and len(closing.group()) >= container.fence_length:
            # closing fence - we're at end of line, so we can return
            parser.finalize(container, parser.line_number)
            return 2

        # skip optional spaces of fence offset
        skippable = container.fence_offset
        while skippable > 0 and is_space_or_tab(peek(line, parser.offset)):
            parser.advance_offset(1, True)
            skippable -= 1
        return 0

    @staticmethod
    def finalize(parser=None, block=None):
        """Turn the accumulated ``string_content`` into ``literal``.

        For a fenced block the first line becomes the (stripped,
        unescaped) ``info`` string and the remainder the literal.  For an
        indented block, trailing lines made only of spaces are collapsed
        into a single newline.  ``string_content`` is cleared afterwards.
        """
        text = block.string_content
        if block.is_fenced:
            # first line becomes info string
            split_at = text.index('\n')
            block.info = unescape_string(text[:split_at].strip())
            block.literal = text[split_at + 1:]
        else:
            # indented
            block.literal = re.sub(r'(\n *)+$', '\n', text)

        block.string_content = None

    @staticmethod
    def can_contain(t):
        """A code block has no block children."""
        return False


class HtmlBlock(Block):
    """Handler for 'html_block' nodes."""
    accepts_lines = True

    @staticmethod
    def continue_(parser=None, container=None):
        """Return 1 (stop) on a blank line for HTML block types 6 and 7;
        return 0 (continue) in every other case."""
        ends_on_blank = parser.blank and container.html_block_type in (6, 7)
        return 1 if ends_on_blank else 0

    @staticmethod
    def finalize(parser=None, block=None):
        """Store ``string_content`` as ``literal`` with trailing lines
        made only of spaces removed, then clear ``string_content``."""
        trimmed = re.sub(r'(\n *)+$', '', block.string_content)
        block.literal = trimmed
        # allow GC
        block.string_content = None

    @staticmethod
    def can_contain(t):
        """An HTML block has no block children."""
        return False


class Paragraph(Block):
    """Handler for 'paragraph' nodes."""
    accepts_lines = True

    @staticmethod
    def continue_(parser=None, container=None):
        """A paragraph continues (0) unless the line is blank (1)."""
        if parser.blank:
            return 1
        return 0

    @staticmethod
    def finalize(parser=None, block=None):
        """Strip leading link reference definitions from the paragraph.

        While the content starts with '[', the inline parser is asked to
        parse a reference definition into ``parser.refmap``; each parsed
        definition is removed from the content.  If at least one was
        removed and only whitespace is left, the paragraph node is
        unlinked from the tree.
        """
        found_refs = False

        # try parsing the beginning as link reference definitions:
        while peek(block.string_content, 0) == '[':
            consumed = parser.inline_parser.parseReference(
                block.string_content, parser.refmap)
            if not consumed:
                break
            block.string_content = block.string_content[consumed:]
            found_refs = True

        if found_refs and is_blank(block.string_content):
            block.unlink()

    @staticmethod
    def can_contain(t):
        """A paragraph has no block children."""
        return False


class BlockStarts(object):
    """Block start functions.

    Each function inspects the current line of ``parser`` at its next
    non-space position and, if the line opens the corresponding kind of
    block, adds that block to the tree and advances the parser.
    ``container`` is the innermost block matched so far for the line.

    Return values:
    0 = no match
    1 = matched container, keep going
    2 = matched leaf, no more block starts
    """
    # Names of the block start functions, in the order they are tried.
    METHODS = [
        'block_quote',
        'atx_heading',
        'fenced_code_block',
        'html_block',
        'setext_heading',
        'thematic_break',
        'list_item',
        'indented_code_block',
    ]

    @staticmethod
    def block_quote(parser, container=None):
        """Open a block quote on a non-indented line starting with '>'."""
        if not parser.indented and \
           peek(parser.current_line, parser.next_nonspace) == '>':
            parser.advance_next_nonspace()
            parser.advance_offset(1, False)
            # optional following space
            if is_space_or_tab(peek(parser.current_line, parser.offset)):
                parser.advance_offset(1, True)
            parser.close_unmatched_blocks()
            parser.add_child('block_quote', parser.next_nonspace)
            return 1

        return 0

    @staticmethod
    def atx_heading(parser, container=None):
        """Open an ATX heading ('#' to '######') on a non-indented line.

        The heading level is the number of '#' characters in the marker;
        the rest of the line, minus any closing '#' sequence, becomes the
        heading's string content and the whole line is consumed."""
        if not parser.indented:
            m = re.search(reATXHeadingMarker,
                          parser.current_line[parser.next_nonspace:])
            if m:
                parser.advance_next_nonspace()
                parser.advance_offset(len(m.group()), False)
                parser.close_unmatched_blocks()
                container = parser.add_child('heading', parser.next_nonspace)
                # number of #s
                container.level = len(m.group().strip())
                # remove trailing ###s:
                container.string_content = re.sub(
                    r'[ \t]+#+[ \t]*$', '', re.sub(
                        r'^[ \t]*#+[ \t]*$',
                        '',
                        parser.current_line[parser.offset:]))
                parser.advance_offset(
                    len(parser.current_line) - parser.offset, False)
                return 2

        return 0

    @staticmethod
    def fenced_code_block(parser, container=None):
        """Open a fenced code block on a non-indented line that starts
        with a code fence, recording the fence's length, character and
        indentation on the new node."""
        if not parser.indented:
            m = re.search(
                reCodeFence,
                parser.current_line[parser.next_nonspace:])
            if m:
                fence_length = len(m.group())
                parser.close_unmatched_blocks()
                container = parser.add_child(
                    'code_block', parser.next_nonspace)
                container.is_fenced = True
                container.fence_length = fence_length
                container.fence_char = m.group()[0]
                container.fence_offset = parser.indent
                parser.advance_next_nonspace()
                parser.advance_offset(fence_length, False)
                return 2

        return 0

    @staticmethod
    def html_block(parser, container=None):
        """Open an HTML block on a non-indented line starting with '<'.

        The start conditions in reHtmlBlockOpen are tried in order (types
        1 to 7) and the first match decides ``html_block_type``.  Type 7
        is not accepted when ``container`` is a paragraph."""
        if not parser.indented and \
           peek(parser.current_line, parser.next_nonspace) == '<':
            s = parser.current_line[parser.next_nonspace:]

            for block_type in range(1, 8):
                if re.search(reHtmlBlockOpen[block_type], s) and \
                   (block_type < 7 or container.t != 'paragraph'):
                    parser.close_unmatched_blocks()
                    # We don't adjust parser.offset;
                    # spaces are part of the HTML block:
                    b = parser.add_child('html_block', parser.offset)
                    b.html_block_type = block_type
                    return 2
        return 0

    @staticmethod
    def setext_heading(parser, container=None):
        """Turn the paragraph ``container`` into a setext heading when the
        current non-indented line is a '=' or '-' underline.

        Leading link reference definitions are first parsed out of the
        paragraph; if nothing is left afterwards there is no heading and
        0 is returned.  Otherwise a heading node (level 1 for '=', level 2
        for '-') replaces the paragraph and becomes the parser's tip."""
        if not parser.indented and container.t == 'paragraph':
            m = re.search(
                reSetextHeadingLine,
                parser.current_line[parser.next_nonspace:])
            if m:
                parser.close_unmatched_blocks()
                # resolve reference link definitions
                while peek(container.string_content, 0) == '[':
                    pos = parser.inline_parser.parseReference(
                            container.string_content, parser.refmap)
                    if not pos:
                        break
                    container.string_content = container.string_content[pos:]
                if container.string_content:
                    heading = Node('heading', container.sourcepos)
                    heading.level = 1 if m.group()[0] == '=' else 2
                    heading.string_content = container.string_content
                    container.insert_after(heading)
                    container.unlink()
                    parser.tip = heading
                    parser.advance_offset(
                        len(parser.current_line) - parser.offset, False)
                    return 2
                else:
                    return 0

        return 0

    @staticmethod
    def thematic_break(parser, container=None):
        """Add a thematic break for a non-indented line matching
        reThematicBreak, consuming the whole line."""
        if not parser.indented and re.search(
                reThematicBreak, parser.current_line[parser.next_nonspace:]):
            parser.close_unmatched_blocks()
            parser.add_child('thematic_break', parser.next_nonspace)
            parser.advance_offset(
                len(parser.current_line) - parser.offset, False)
            return 2
        return 0

    @staticmethod
    def list_item(parser, container=None):
        """Open a list item when the line starts with a list marker.

        Tried when the line is not indented or ``container`` is a list.
        A new list node is added first unless the parser's tip is a list
        whose data matches the parsed marker."""
        if (not parser.indented or container.t == 'list'):
            data = parse_list_marker(parser, container)
            if data:
                parser.close_unmatched_blocks()

                # add the list if needed
                if parser.tip.t != 'list' or \
                   not lists_match(container.list_data, data):
                    container = parser.add_child('list', parser.next_nonspace)
                    container.list_data = data

                # add the list item
                container = parser.add_child('item', parser.next_nonspace)
                container.list_data = data
                return 1

        return 0

    @staticmethod
    def indented_code_block(parser, container=None):
        """Open an indented code block on an indented, non-blank line,
        unless the parser's tip is a paragraph."""
        if parser.indented and \
           parser.tip.t != 'paragraph' and \
                           not parser.blank:
            # indented code
            parser.advance_offset(CODE_INDENT, True)
            parser.close_unmatched_blocks()
            parser.add_child('code_block', parser.offset)
            return 2

        return 0


class Parser(object):
    """Block-structure parser: builds a tree of ``Node`` objects from text,
    one line at a time, then runs the inline parser over the result."""

    def __init__(self, options=None):
        """Create a parser.

        ``options`` is handed to the inline parser and kept on the
        instance; when omitted, each parser gets its own empty dict
        (rather than one dict shared between all instances).
        """
        if options is None:
            options = {}
        self.doc = Node('document', [[1, 1], [0, 0]])
        self.block_starts = BlockStarts()
        self.tip = self.doc
        self.oldtip = self.doc
        self.current_line = ''
        self.line_number = 0
        self.offset = 0
        self.column = 0
        self.next_nonspace = 0
        self.next_nonspace_column = 0
        self.indent = 0
        self.indented = False
        self.blank = False
        self.partially_consumed_tab = False
        self.all_closed = True
        self.last_matched_container = self.doc
        self.refmap = {}
        self.last_line_length = 0
        self.inline_parser = InlineParser(options)
        self.options = options

    def add_line(self):
        """ Add a line to the block at the tip.  We assume the tip
        can accept lines -- that check should be done before calling this."""
        if self.partially_consumed_tab:
            # Skip over tab
            self.offset += 1
            # Add space characters
            chars_to_tab = 4 - (self.column % 4)
            self.tip.string_content += (' ' * chars_to_tab)
        self.tip.string_content += (self.current_line[self.offset:] + '\n')

    def add_child(self, tag, offset):
        """ Add block of type tag as a child of the tip.  If the tip can't
        accept children, close and finalize it and try its parent,
        and so on til we find a block that can accept children.

        ``offset`` is the 0-based position in the line; the new node's
        start column is recorded as ``offset + 1``.  Returns the new node,
        which also becomes the tip."""
        while not self.blocks[self.tip.t].can_contain(tag):
            self.finalize(self.tip, self.line_number - 1)

        column_number = offset + 1
        new_block = Node(tag, [[self.line_number, column_number], [0, 0]])
        new_block.string_content = ''
        self.tip.append_child(new_block)
        self.tip = new_block
        return new_block

    def close_unmatched_blocks(self):
        """Finalize and close any unmatched blocks."""
        if not self.all_closed:
            while self.oldtip != self.last_matched_container:
                parent = self.oldtip.parent
                self.finalize(self.oldtip, self.line_number - 1)
                self.oldtip = parent
            self.all_closed = True

    def find_next_nonspace(self):
        """Locate the next character that is not a space or tab.

        Scans from the current offset/column (tabs advance to the next
        multiple of 4 columns) and sets ``next_nonspace``,
        ``next_nonspace_column``, ``indent``, ``indented`` and ``blank``.
        The offset and column themselves are not moved."""
        current_line = self.current_line
        i = self.offset
        cols = self.column

        try:
            c = current_line[i]
        except IndexError:
            c = ''
        while c != '':
            if c == ' ':
                i += 1
                cols += 1
            elif c == '\t':
                i += 1
                cols += (4 - (cols % 4))
            else:
                break

            try:
                c = current_line[i]
            except IndexError:
                c = ''

        self.blank = (c == '\n' or c == '\r' or c == '')
        self.next_nonspace = i
        self.next_nonspace_column = cols
        self.indent = self.next_nonspace_column - self.column
        self.indented = self.indent >= CODE_INDENT

    def advance_next_nonspace(self):
        """Move offset and column to the position found by
        ``find_next_nonspace``."""
        self.offset = self.next_nonspace
        self.column = self.next_nonspace_column
        self.partially_consumed_tab = False

    def advance_offset(self, count, columns):
        """Advance through the current line.

        ``count`` is a number of columns when ``columns`` is true and a
        number of characters otherwise.  When counting columns, a tab that
        is wider than the remaining count is only partly consumed: the
        column moves but the offset stays on the tab, and
        ``partially_consumed_tab`` is set.  Stops early at end of line."""
        current_line = self.current_line
        try:
            c = current_line[self.offset]
        except IndexError:
            c = None
        while count > 0 and c is not None:
            if c == '\t':
                chars_to_tab = 4 - (self.column % 4)
                if columns:
                    self.partially_consumed_tab = chars_to_tab > count
                    chars_to_advance = min(count, chars_to_tab)
                    self.column += chars_to_advance
                    self.offset += 0 if self.partially_consumed_tab else 1
                    count -= chars_to_advance
                else:
                    self.partially_consumed_tab = False
                    self.column += chars_to_tab
                    self.offset += 1
                    count -= 1
            else:
                self.partially_consumed_tab = False
                self.offset += 1
                # assume ascii; block starts are ascii
                self.column += 1
                count -= 1
            try:
                c = current_line[self.offset]
            except IndexError:
                c = None

    def incorporate_line(self, ln):
        """Analyze a line of text and update the document appropriately.

        We parse markdown text by calling this on each line of input,
        then finalizing the document.
        """
        all_matched = True

        container = self.doc
        self.oldtip = self.tip
        self.offset = 0
        self.column = 0
        self.blank = False
        self.partially_consumed_tab = False
        self.line_number += 1

        # replace NUL characters for security
        if re.search(r'\u0000', ln) is not None:
            ln = re.sub(r'\0', '\uFFFD', ln)

        self.current_line = ln

        # For each containing block, try to parse the associated line start.
        # Bail out on failure: container will point to the last matching block.
        # Set all_matched to false if not all containers match.
        while True:
            last_child = container.last_child
            if not (last_child and last_child.is_open):
                break
            container = last_child

            self.find_next_nonspace()

            rv = self.blocks[container.t].continue_(self, container)
            if rv == 0:
                # we've matched, keep going
                pass
            elif rv == 1:
                # we've failed to match a block
                all_matched = False
            elif rv == 2:
                # we've hit end of line for fenced code close and can return
                self.last_line_length = len(ln)
                return
            else:
                raise ValueError(
                    'continue_ returned illegal value, must be 0, 1, or 2')

            if not all_matched:
                # back up to last matching block
                container = container.parent
                break

        self.all_closed = (container == self.oldtip)
        self.last_matched_container = container

        matched_leaf = container.t != 'paragraph' and \
            self.blocks[container.t].accepts_lines
        starts = self.block_starts
        starts_len = len(starts.METHODS)
        # Unless last matched container is a code block, try new container
        # starts, adding children to the last matched container:
        while not matched_leaf:
            self.find_next_nonspace()

            # this is a little performance optimization:
            if not self.indented and \
               not re.search(reMaybeSpecial, ln[self.next_nonspace:]):
                self.advance_next_nonspace()
                break

            i = 0
            while i < starts_len:
                res = getattr(starts, starts.METHODS[i])(self, container)
                if res == 1:
                    container = self.tip
                    break
                elif res == 2:
                    container = self.tip
                    matched_leaf = True
                    break
                else:
                    i += 1

            if i == starts_len:
                # nothing matched
                self.advance_next_nonspace()
                break

        # What remains at the offset is a text line. Add the text to the
        # appropriate container.
        if not self.all_closed and not self.blank and \
           self.tip.t == 'paragraph':
            # lazy paragraph continuation
            self.add_line()
        else:
            # not a lazy continuation
            # finalize any blocks not matched
            self.close_unmatched_blocks()
            if self.blank and container.last_child:
                container.last_child.last_line_blank = True

            t = container.t

            # Block quote lines are never blank as they start with >
            # and we don't count blanks in fenced code for purposes of
            # tight/loose lists or breaking out of lists.  We also
            # don't set last_line_blank on an empty list item, or if we
            # just closed a fenced block.
            last_line_blank = self.blank and \
                not (t == 'block_quote' or
                     (t == 'code_block' and container.is_fenced) or
                     (t == 'item' and
                      not container.first_child and
                      container.sourcepos[0][0] == self.line_number))

            # propagate last_line_blank up through parents:
            cont = container
            while cont:
                cont.last_line_blank = last_line_blank
                cont = cont.parent

            if self.blocks[t].accepts_lines:
                self.add_line()
                # if HtmlBlock, check for end condition
                if t == 'html_block' and \
                   container.html_block_type >= 1 and \
                   container.html_block_type <= 5 and \
                   re.search(
                       reHtmlBlockClose[container.html_block_type],
                       self.current_line[self.offset:]):
                    self.finalize(container, self.line_number)
            elif self.offset < len(ln) and not self.blank:
                # create a paragraph container for one line
                container = self.add_child('paragraph', self.offset)
                self.advance_next_nonspace()
                self.add_line()

        self.last_line_length = len(ln)

    def finalize(self, block, line_number):
        """ Finalize a block.  Close it and do any necessary postprocessing,
        e.g. creating string_content from strings, setting the 'tight'
        or 'loose' status of a list, and parsing the beginnings
        of paragraphs for reference definitions.  Reset the tip to the
        parent of the closed block."""
        above = block.parent
        block.is_open = False
        block.sourcepos[1] = [line_number, self.last_line_length]
        self.blocks[block.t].finalize(self, block)
        self.tip = above

    def process_inlines(self, block):
        """
        Walk through a block & children recursively, parsing string content
        into inline content where appropriate.
        """
        walker = block.walker()
        self.inline_parser.refmap = self.refmap
        self.inline_parser.options = self.options
        event = walker.nxt()
        while event is not None:
            node = event['node']
            t = node.t
            # inline content is parsed when leaving a paragraph or heading
            if not event['entering'] and (t == 'paragraph' or t == 'heading'):
                self.inline_parser.parse(node)
            event = walker.nxt()

    def parse(self, my_input):
        """ The main parsing function.  Returns a parsed document AST."""
        self.doc = Node('document', [[1, 1], [0, 0]])
        self.tip = self.doc
        self.refmap = {}
        self.line_number = 0
        self.last_line_length = 0
        self.offset = 0
        self.column = 0
        self.last_matched_container = self.doc
        self.current_line = ''
        lines = re.split(reLineEnding, my_input)
        length = len(lines)
        if len(my_input) > 0 and my_input[-1] == '\n':
            # ignore last blank line created by final newline
            length -= 1
        for line in lines[:length]:
            self.incorporate_line(line)
        # close every block that is still open, up to the document
        while (self.tip):
            self.finalize(self.tip, length)
        self.process_inlines(self.doc)
        return self.doc
# Finds the boundary before each inner capitalised word of a CamelCase
# name so an underscore can be inserted there.
CAMEL_RE = re.compile("(.)([A-Z](?:[a-z]+|(?<=[a-z0-9].)))")

# Map each node type name (the snake_case form of a Block subclass name,
# e.g. BlockQuote -> 'block_quote') to its handler class.
Parser.blocks = dict(
    (CAMEL_RE.sub(r'\1_\2', cls.__name__).lower(), cls)
    for cls in Block.__subclasses__())