Source code for bleach

# -*- coding: utf-8 -*-

from __future__ import unicode_literals

import warnings
from pkg_resources import parse_version

from bleach.linkifier import (
    DEFAULT_CALLBACKS,
    Linker,
)
from bleach.sanitizer import (
    ALLOWED_ATTRIBUTES,
    ALLOWED_PROTOCOLS,
    ALLOWED_STYLES,
    ALLOWED_TAGS,
    Cleaner,
)


import html5lib
# Work out the installed html5lib version as a list of string components,
# e.g. '1.0.1' -> ['1', '0', '1'].  This is best effort: if the version
# cannot be determined for any reason, fall back to a placeholder rather
# than failing the import of this package.
try:
    _html5lib_version = html5lib.__version__.split('.')
    if len(_html5lib_version) < 2:
        # Guarantee a minor component so the index [1] lookup below is safe.
        _html5lib_version = _html5lib_version + ['0']
except Exception:
    # The fallback must bind the same name that the check below reads;
    # otherwise a failed detection turns into a NameError at import time.
    _html5lib_version = ['unknown', 'unknown']


# Bleach 3.0.0 won't support html5lib-python < 1.0.0.
#
# The comparison is a lexicographic comparison of lists of strings.  The
# second clause catches pre-release versions whose minor component contains
# a 'b' (for example '0b10').  The ['unknown', 'unknown'] placeholder sorts
# after ['1', '0'] and contains no 'b', so no warning is emitted for it.
if _html5lib_version < ['1', '0'] or 'b' in _html5lib_version[1]:
    warnings.warn('Support for html5lib-python < 1.0.0 is deprecated.', DeprecationWarning)


# Release date of this version, formatted as yyyymmdd.
__releasedate__ = '20171207'
# Version string in semver form: x.y.z for releases, x.y.z.dev0 for
# development versions.
__version__ = '2.1.2'
# Parsed form of ``__version__`` as returned by pkg_resources.parse_version.
VERSION = parse_version(__version__)


# Names exported by ``from bleach import *``.
__all__ = ['clean', 'linkify']


def clean(text, tags=ALLOWED_TAGS, attributes=ALLOWED_ATTRIBUTES,
          styles=ALLOWED_STYLES, protocols=ALLOWED_PROTOCOLS, strip=False,
          strip_comments=True):
    """Remove malicious content from an HTML fragment and return the result

    This is a security-focused function. Its only job is to strip
    malicious content out of a string so that the string can be displayed
    as content in a web page. It is not designed for transforming content
    that will be used in non-web-page contexts.

    Example::

        import bleach

        better_text = bleach.clean(yucky_text)

    .. Note::

       If you're cleaning a lot of text with the same argument values, or
       you want more configurability, consider using a
       :py:class:`bleach.sanitizer.Cleaner` instance instead.

    :arg str text: the text to clean

    :arg list tags: allowed list of tags; defaults to
        ``bleach.sanitizer.ALLOWED_TAGS``

    :arg dict attributes: allowed attributes; can be a callable, list or
        dict; defaults to ``bleach.sanitizer.ALLOWED_ATTRIBUTES``

    :arg list styles: allowed list of css styles; defaults to
        ``bleach.sanitizer.ALLOWED_STYLES``

    :arg list protocols: allowed list of protocols for links; defaults to
        ``bleach.sanitizer.ALLOWED_PROTOCOLS``

    :arg bool strip: whether or not to strip disallowed elements

    :arg bool strip_comments: whether or not to strip HTML comments

    :returns: cleaned text as unicode

    """
    # Forward every option unchanged to a one-shot Cleaner.
    settings = {
        'tags': tags,
        'attributes': attributes,
        'styles': styles,
        'protocols': protocols,
        'strip': strip,
        'strip_comments': strip_comments,
    }
    return Cleaner(**settings).clean(text)
def linkify(text, callbacks=DEFAULT_CALLBACKS, skip_tags=None,
            parse_email=False):
    """Turn URL-like strings in an HTML fragment into links

    Strings that look like URLs, domain names and email addresses are
    converted to links. The text may itself be an HTML fragment; the
    following are preserved:

    1. links already in the string
    2. urls found in attributes
    3. email addresses

    linkify takes a best-effort approach and tries to recover from bad
    situations caused by crazy text.

    .. Note::

       If you're linking a lot of text with the same argument values, or
       you want more configurability, consider using a
       :py:class:`bleach.linkifier.Linker` instance instead.

    .. Note::

       If you have text that you want to clean and then linkify, consider
       using the :py:class:`bleach.linkifier.LinkifyFilter` as a filter in
       the clean pass. That way you're not parsing the HTML twice.

    :arg str text: the text to linkify

    :arg list callbacks: list of callbacks to run when adjusting tag
        attributes; defaults to ``bleach.linkifier.DEFAULT_CALLBACKS``

    :arg list skip_tags: list of tags that you don't want to linkify the
        contents of; for example, you could set this to ``['pre']`` to
        skip linkifying contents of ``pre`` tags

    :arg bool parse_email: whether or not to linkify email addresses

    :returns: linkified text as unicode

    """
    # Build a one-shot Linker with the caller's options and run it.
    return Linker(
        callbacks=callbacks,
        skip_tags=skip_tags,
        parse_email=parse_email,
    ).linkify(text)