'])
        >>> locate_unbalanced_start(unbalanced_start, pre, post)
        >>> pre, post
        (['
 that
    we found will be effectively replaced by the div in the original
    document.  If this doesn't work out, we just throw away
    unbalanced_start without doing anything.
    """
    while 1:
        if not unbalanced_start:
            # We have totally succeeded in finding the position
            break
        finding = unbalanced_start[0]
        finding_name = finding.split()[0].strip('<>')
        if not post_delete:
            break
        next = post_delete[0]
        if next is DEL_START or not next.startswith('<'):
            # Reached a word, we can't move the delete text forward
            break
        if next[1] == '/':
            # Reached a closing tag, can we go further?  Maybe not...
            break
        name = next.split()[0].strip('<>')
        if name == 'ins':
            # Can't move into an insert
            break
        assert name != 'del', (
            "Unexpected delete tag: %r" % next)
        if name == finding_name:
            unbalanced_start.pop(0)
            pre_delete.append(post_delete.pop(0))
        else:
            # Found a tag that doesn't match
            break
def locate_unbalanced_end(unbalanced_end, pre_delete, post_delete):
    """ like locate_unbalanced_start, except handling end tags and
    possibly moving the point earlier in the document.  """
    while 1:
        if not unbalanced_end:
            # Success
            break
        finding = unbalanced_end[-1]
        finding_name = finding.split()[0].strip('<>/')
        if not pre_delete:
            break
        next = pre_delete[-1]
        if next is DEL_END or not next.startswith(''):
            # A word or a start tag
            break
        name = next.split()[0].strip('<>/')
        if name == 'ins' or name == 'del':
            # Can't move into an insert or delete
            break
        if name == finding_name:
            unbalanced_end.pop()
            post_delete.insert(0, pre_delete.pop())
        else:
            # Found a tag that doesn't match
            break
class token(_unicode):
    """ Represents a diffable token, generally a word that is displayed to
    the user.  Opening tags are attached to this token when they are
    adjacent (pre_tags) and closing tags that follow the word
    (post_tags).  Some exceptions occur when there are empty tags
    adjacent to a word, so there may be close tags in pre_tags, or
    open tags in post_tags.
    We also keep track of whether the word was originally followed by
    whitespace, even though we do not want to treat the word as
    equivalent to a similar word that does not have a trailing
    space."""
    # When this is true, the token will be eliminated from the
    # displayed diff if no change has occurred:
    hide_when_equal = False
    def __new__(cls, text, pre_tags=None, post_tags=None, trailing_whitespace=""):
        obj = _unicode.__new__(cls, text)
        if pre_tags is not None:
            obj.pre_tags = pre_tags
        else:
            obj.pre_tags = []
        if post_tags is not None:
            obj.post_tags = post_tags
        else:
            obj.post_tags = []
        obj.trailing_whitespace = trailing_whitespace
        return obj
    def __repr__(self):
        return 'token(%s, %r, %r, %r)' % (_unicode.__repr__(self), self.pre_tags,
                                          self.post_tags, self.trailing_whitespace)
    def html(self):
        return _unicode(self)
class tag_token(token):
    """ Represents a token that is actually a tag.  Currently this is just
    the 
![]()
 tag, which takes up visible space just like a word but
    is only represented in a document by a tag.  """
    def __new__(cls, tag, data, html_repr, pre_tags=None, 
                post_tags=None, trailing_whitespace=""):
        obj = token.__new__(cls, "%s: %s" % (type, data), 
                            pre_tags=pre_tags, 
                            post_tags=post_tags, 
                            trailing_whitespace=trailing_whitespace)
        obj.tag = tag
        obj.data = data
        obj.html_repr = html_repr
        return obj
    def __repr__(self):
        return 'tag_token(%s, %s, html_repr=%s, post_tags=%r, pre_tags=%r, trailing_whitespace=%r)' % (
            self.tag, 
            self.data, 
            self.html_repr, 
            self.pre_tags, 
            self.post_tags, 
            self.trailing_whitespace)
    def html(self):
        return self.html_repr
class href_token(token):
    """ Represents the href in an anchor tag.  Unlike other words, we only
    show the href when it changes.  """
    hide_when_equal = True
    def html(self):
        return ' Link: %s' % self
def tokenize(html, include_hrefs=True):
    """
    Parse the given HTML and returns token objects (words with attached tags).
    This parses only the content of a page; anything in the head is
    ignored, and the  and  elements are themselves
    optional.  The content is then parsed by lxml, which ensures the
    validity of the resulting parsed document (though lxml may make
    incorrect guesses when the markup is particular bad).
    
 and  tags are also eliminated from the document, as
    that gets confusing.
    If include_hrefs is true, then the href attribute of  tags is
    included as a special kind of diffable token."""
    if etree.iselement(html):
        body_el = html
    else:
        body_el = parse_html(html, cleanup=True)
    # Then we split the document into text chunks for each tag, word, and end tag:
    chunks = flatten_el(body_el, skip_tag=True, include_hrefs=include_hrefs)
    # Finally re-joining them into token objects:
    return fixup_chunks(chunks)
def parse_html(html, cleanup=True):
    """
    Parses an HTML fragment, returning an lxml element.  Note that the HTML will be
    wrapped in a  tag that was not in the original document.
    If cleanup is true, make sure there's no  or , and get
    rid of any 
 and  tags.
    """
    if cleanup:
        # This removes any extra markup or structure like :
        html = cleanup_html(html)
    return fragment_fromstring(html, create_parent=True)
_body_re = re.compile(r'', re.I|re.S)
_end_body_re = re.compile(r'', re.I|re.S)
_ins_del_re = re.compile(r'?(ins|del).*?>', re.I|re.S)
def cleanup_html(html):
    """ This 'cleans' the HTML, meaning that any page structure is removed
    (only the contents of  are used, if there is any  and  tags are removed.  """
    match = _body_re.search(html)
    if match:
        html = html[match.end():]
    match = _end_body_re.search(html)
    if match:
        html = html[:match.start()]
    html = _ins_del_re.sub('', html)
    return html
    
end_whitespace_re = re.compile(r'[ \t\n\r]$')
def split_trailing_whitespace(word):
    """
    This function takes a word, such as 'test\n\n' and returns ('test','\n\n')
    """
    stripped_length = len(word.rstrip())
    return word[0:stripped_length], word[stripped_length:]
def fixup_chunks(chunks):
    """
    This function takes a list of chunks and produces a list of tokens.
    """
    tag_accum = []
    cur_word = None
    result = []
    for chunk in chunks:
        if isinstance(chunk, tuple):
            if chunk[0] == 'img':
                src = chunk[1]
                tag, trailing_whitespace = split_trailing_whitespace(chunk[2])
                cur_word = tag_token('img', src, html_repr=tag,
                                     pre_tags=tag_accum,
                                     trailing_whitespace=trailing_whitespace)
                tag_accum = []
                result.append(cur_word)
            elif chunk[0] == 'href':
                href = chunk[1]
                cur_word = href_token(href, pre_tags=tag_accum, trailing_whitespace=" ")
                tag_accum = []
                result.append(cur_word)
            continue
        if is_word(chunk):
            chunk, trailing_whitespace = split_trailing_whitespace(chunk)
            cur_word = token(chunk, pre_tags=tag_accum, trailing_whitespace=trailing_whitespace)
            tag_accum = []
            result.append(cur_word)
        elif is_start_tag(chunk):
            tag_accum.append(chunk)
        elif is_end_tag(chunk):
            if tag_accum:
                tag_accum.append(chunk)
            else:
                assert cur_word, (
                    "Weird state, cur_word=%r, result=%r, chunks=%r of %r"
                    % (cur_word, result, chunk, chunks))
                cur_word.post_tags.append(chunk)
        else:
            assert False
    if not result:
        return [token('', pre_tags=tag_accum)]
    else:
        result[-1].post_tags.extend(tag_accum)
    return result
# All the tags in HTML that don't require end tags:
empty_tags = (
    'param', 'img', 'area', 'br', 'basefont', 'input',
    'base', 'meta', 'link', 'col')
block_level_tags = (
    'address',
    'blockquote',
    'center',
    'dir',
    'div',
    'dl',
    'fieldset',
    'form',
    'h1',
    'h2',
    'h3',
    'h4',
    'h5',
    'h6',
    'hr',
    'isindex',
    'menu',
    'noframes',
    'noscript',
    'ol',
    'p',
    'pre',
    'table',
    'ul',
    )
block_level_container_tags = (
    'dd',
    'dt',
    'frameset',
    'li',
    'tbody',
    'td',
    'tfoot',
    'th',
    'thead',
    'tr',
    )
def flatten_el(el, include_hrefs, skip_tag=False):
    """ Takes an lxml element el, and generates all the text chunks for
    that tag.  Each start tag is a chunk, each word is a chunk, and each
    end tag is a chunk.
    If skip_tag is true, then the outermost container tag is
    not returned (just its contents)."""
    if not skip_tag:
        if el.tag == 'img':
            yield ('img', el.get('src'), start_tag(el))
        else:
            yield start_tag(el)
    if el.tag in empty_tags and not el.text and not len(el) and not el.tail:
        return
    start_words = split_words(el.text)
    for word in start_words:
        yield html_escape(word)
    for child in el:
        for item in flatten_el(child, include_hrefs=include_hrefs):
            yield item
    if el.tag == 'a' and el.get('href') and include_hrefs:
        yield ('href', el.get('href'))
    if not skip_tag:
        yield end_tag(el)
        end_words = split_words(el.tail)
        for word in end_words:
            yield html_escape(word)
split_words_re = re.compile(r'\S+(?:\s+|$)', re.U)
def split_words(text):
    """ Splits some text into words. Includes trailing whitespace
    on each word when appropriate.  """
    if not text or not text.strip():
        return []
    words = split_words_re.findall(text)
    return words
start_whitespace_re = re.compile(r'^[ \t\n\r]')
def start_tag(el):
    """
    The text representation of the start tag for a tag.
    """
    return '<%s%s>' % (
        el.tag, ''.join([' %s="%s"' % (name, html_escape(value, True))
                         for name, value in el.attrib.items()]))
def end_tag(el):
    """ The text representation of an end tag for a tag.  Includes
    trailing whitespace when appropriate.  """
    if el.tail and start_whitespace_re.search(el.tail):
        extra = ' '
    else:
        extra = ''
    return '%s>%s' % (el.tag, extra)
def is_word(tok):
    return not tok.startswith('<')
def is_end_tag(tok):
    return tok.startswith('')
def is_start_tag(tok):
    return tok.startswith('<') and not tok.startswith('')
def fixup_ins_del_tags(html):
    """ Given an html string, move any  or  tags inside of any
    block-level elements, e.g. transform word
 to
    word
 """
    doc = parse_html(html, cleanup=False)
    _fixup_ins_del_tags(doc)
    html = serialize_html_fragment(doc, skip_outer=True)
    return html
def serialize_html_fragment(el, skip_outer=False):
    """ Serialize a single lxml element as HTML.  The serialized form
    includes the elements tail.  
    If skip_outer is true, then don't serialize the outermost tag
    """
    assert not isinstance(el, basestring), (
        "You should pass in an element, not a string like %r" % el)
    html = etree.tostring(el, method="html", encoding=_unicode)
    if skip_outer:
        # Get rid of the extra starting tag:
        html = html[html.find('>')+1:]
        # Get rid of the extra end tag:
        html = html[:html.rfind('<')]
        return html.strip()
    else:
        return html
def _fixup_ins_del_tags(doc):
    """fixup_ins_del_tags that works on an lxml document in-place
    """
    for tag in ['ins', 'del']:
        for el in doc.xpath('descendant-or-self::%s' % tag):
            if not _contains_block_level_tag(el):
                continue
            _move_el_inside_block(el, tag=tag)
            el.drop_tag()
            #_merge_element_contents(el)
def _contains_block_level_tag(el):
    """True if the element contains any block-level elements, like , 
, etc.
    """
    if el.tag in block_level_tags or el.tag in block_level_container_tags:
        return True
    for child in el:
        if _contains_block_level_tag(child):
            return True
    return False
def _move_el_inside_block(el, tag):
    """ helper for _fixup_ins_del_tags; actually takes the  etc tags
    and moves them inside any block-level tags.  """
    for child in el:
        if _contains_block_level_tag(child):
            break
    else:
        # No block-level tags in any child
        children_tag = etree.Element(tag)
        children_tag.text = el.text
        el.text = None
        children_tag.extend(list(el))
        el[:] = [children_tag]
        return
    for child in list(el):
        if _contains_block_level_tag(child):
            _move_el_inside_block(child, tag)
            if child.tail:
                tail_tag = etree.Element(tag)
                tail_tag.text = child.tail
                child.tail = None
                el.insert(el.index(child)+1, tail_tag)
        else:
            child_tag = etree.Element(tag)
            el.replace(child, child_tag)
            child_tag.append(child)
    if el.text:
        text_tag = etree.Element(tag)
        text_tag.text = el.text
        el.text = None
        el.insert(0, text_tag)
            
def _merge_element_contents(el):
    """
    Removes an element, but merges its contents into its place, e.g.,
    given Hi there!, if you remove the  element you get Hi there!"""
    parent = el.getparent()
    text = el.text or ''
    if el.tail:
        if not len(el):
            text += el.tail
        else:
            if el[-1].tail:
                el[-1].tail += el.tail
            else:
                el[-1].tail = el.tail
    index = parent.index(el)
    if text:
        if index == 0:
            previous = None
        else:
            previous = parent[index-1]
        if previous is None:
            if parent.text:
                parent.text += text
            else:
                parent.text = text
        else:
            if previous.tail:
                previous.tail += text
            else:
                previous.tail = text
    parent[index:index+1] = el.getchildren()
class InsensitiveSequenceMatcher(difflib.SequenceMatcher):
    """
    Acts like SequenceMatcher, but tries not to find very small equal
    blocks amidst large spans of changes
    """
    threshold = 2
    
    def get_matching_blocks(self):
        size = min(len(self.b), len(self.b))
        threshold = min(self.threshold, size / 4)
        actual = difflib.SequenceMatcher.get_matching_blocks(self)
        return [item for item in actual
                if item[2] > threshold
                or not item[2]]
if __name__ == '__main__':
    from lxml.html import _diffcommand
    _diffcommand.main() |