File manager - Edit - /home/newsbmcs.com/public_html/static/img/logo/treewalkers.tar
Back
base.py 0000644 00000016464 15030125115 0006034 0 ustar 00 from __future__ import absolute_import, division, unicode_literals from xml.dom import Node from ..constants import namespaces, voidElements, spaceCharacters __all__ = ["DOCUMENT", "DOCTYPE", "TEXT", "ELEMENT", "COMMENT", "ENTITY", "UNKNOWN", "TreeWalker", "NonRecursiveTreeWalker"] DOCUMENT = Node.DOCUMENT_NODE DOCTYPE = Node.DOCUMENT_TYPE_NODE TEXT = Node.TEXT_NODE ELEMENT = Node.ELEMENT_NODE COMMENT = Node.COMMENT_NODE ENTITY = Node.ENTITY_NODE UNKNOWN = "<#UNKNOWN#>" spaceCharacters = "".join(spaceCharacters) class TreeWalker(object): """Walks a tree yielding tokens Tokens are dicts that all have a ``type`` field specifying the type of the token. """ def __init__(self, tree): """Creates a TreeWalker :arg tree: the tree to walk """ self.tree = tree def __iter__(self): raise NotImplementedError def error(self, msg): """Generates an error token with the given message :arg msg: the error message :returns: SerializeError token """ return {"type": "SerializeError", "data": msg} def emptyTag(self, namespace, name, attrs, hasChildren=False): """Generates an EmptyTag token :arg namespace: the namespace of the token--can be ``None`` :arg name: the name of the element :arg attrs: the attributes of the element as a dict :arg hasChildren: whether or not to yield a SerializationError because this tag shouldn't have children :returns: EmptyTag token """ yield {"type": "EmptyTag", "name": name, "namespace": namespace, "data": attrs} if hasChildren: yield self.error("Void element has children") def startTag(self, namespace, name, attrs): """Generates a StartTag token :arg namespace: the namespace of the token--can be ``None`` :arg name: the name of the element :arg attrs: the attributes of the element as a dict :returns: StartTag token """ return {"type": "StartTag", "name": name, "namespace": namespace, "data": attrs} def endTag(self, namespace, name): """Generates an EndTag token :arg namespace: the namespace of the token--can be ``None`` :arg name: the name of the element :returns: EndTag token """ return {"type": "EndTag", "name": name, "namespace": namespace} def text(self, data): """Generates SpaceCharacters and Characters tokens Depending on what's in the data, this generates one or more ``SpaceCharacters`` and ``Characters`` tokens. For example: >>> from html5lib.treewalkers.base import TreeWalker >>> # Give it an empty tree just so it instantiates >>> walker = TreeWalker([]) >>> list(walker.text('')) [] >>> list(walker.text(' ')) [{u'data': ' ', u'type': u'SpaceCharacters'}] >>> list(walker.text(' abc ')) # doctest: +NORMALIZE_WHITESPACE [{u'data': ' ', u'type': u'SpaceCharacters'}, {u'data': u'abc', u'type': u'Characters'}, {u'data': u' ', u'type': u'SpaceCharacters'}] :arg data: the text data :returns: one or more ``SpaceCharacters`` and ``Characters`` tokens """ data = data middle = data.lstrip(spaceCharacters) left = data[:len(data) - len(middle)] if left: yield {"type": "SpaceCharacters", "data": left} data = middle middle = data.rstrip(spaceCharacters) right = data[len(middle):] if middle: yield {"type": "Characters", "data": middle} if right: yield {"type": "SpaceCharacters", "data": right} def comment(self, data): """Generates a Comment token :arg data: the comment :returns: Comment token """ return {"type": "Comment", "data": data} def doctype(self, name, publicId=None, systemId=None): """Generates a Doctype token :arg name: :arg publicId: :arg systemId: :returns: the Doctype token """ return {"type": "Doctype", "name": name, "publicId": publicId, "systemId": systemId} def entity(self, name): """Generates an Entity token :arg name: the entity name :returns: an Entity token """ return {"type": "Entity", "name": name} def unknown(self, nodeType): """Handles unknown node types""" return self.error("Unknown node type: " + nodeType) class NonRecursiveTreeWalker(TreeWalker): def getNodeDetails(self, node): raise NotImplementedError def getFirstChild(self, node): raise NotImplementedError def getNextSibling(self, node): raise NotImplementedError def getParentNode(self, node): raise NotImplementedError def __iter__(self): currentNode = self.tree while currentNode is not None: details = self.getNodeDetails(currentNode) type, details = details[0], details[1:] hasChildren = False if type == DOCTYPE: yield self.doctype(*details) elif type == TEXT: for token in self.text(*details): yield token elif type == ELEMENT: namespace, name, attributes, hasChildren = details if (not namespace or namespace == namespaces["html"]) and name in voidElements: for token in self.emptyTag(namespace, name, attributes, hasChildren): yield token hasChildren = False else: yield self.startTag(namespace, name, attributes) elif type == COMMENT: yield self.comment(details[0]) elif type == ENTITY: yield self.entity(details[0]) elif type == DOCUMENT: hasChildren = True else: yield self.unknown(details[0]) if hasChildren: firstChild = self.getFirstChild(currentNode) else: firstChild = None if firstChild is not None: currentNode = firstChild else: while currentNode is not None: details = self.getNodeDetails(currentNode) type, details = details[0], details[1:] if type == ELEMENT: namespace, name, attributes, hasChildren = details if (namespace and namespace != namespaces["html"]) or name not in voidElements: yield self.endTag(namespace, name) if self.tree is currentNode: currentNode = None break nextSibling = self.getNextSibling(currentNode) if nextSibling is not None: currentNode = nextSibling break else: currentNode = self.getParentNode(currentNode) etree_lxml.py 0000644 00000014325 15030125115 0007254 0 ustar 00 from __future__ import absolute_import, division, unicode_literals from pip._vendor.six import text_type from collections import OrderedDict from lxml import etree from ..treebuilders.etree import tag_regexp from . import base from .. import _ihatexml def ensure_str(s): if s is None: return None elif isinstance(s, text_type): return s else: return s.decode("ascii", "strict") class Root(object): def __init__(self, et): self.elementtree = et self.children = [] try: if et.docinfo.internalDTD: self.children.append(Doctype(self, ensure_str(et.docinfo.root_name), ensure_str(et.docinfo.public_id), ensure_str(et.docinfo.system_url))) except AttributeError: pass try: node = et.getroot() except AttributeError: node = et while node.getprevious() is not None: node = node.getprevious() while node is not None: self.children.append(node) node = node.getnext() self.text = None self.tail = None def __getitem__(self, key): return self.children[key] def getnext(self): return None def __len__(self): return 1 class Doctype(object): def __init__(self, root_node, name, public_id, system_id): self.root_node = root_node self.name = name self.public_id = public_id self.system_id = system_id self.text = None self.tail = None def getnext(self): return self.root_node.children[1] class FragmentRoot(Root): def __init__(self, children): self.children = [FragmentWrapper(self, child) for child in children] self.text = self.tail = None def getnext(self): return None class FragmentWrapper(object): def __init__(self, fragment_root, obj): self.root_node = fragment_root self.obj = obj if hasattr(self.obj, 'text'): self.text = ensure_str(self.obj.text) else: self.text = None if hasattr(self.obj, 'tail'): self.tail = ensure_str(self.obj.tail) else: self.tail = None def __getattr__(self, name): return getattr(self.obj, name) def getnext(self): siblings = self.root_node.children idx = siblings.index(self) if idx < len(siblings) - 1: return siblings[idx + 1] else: return None def __getitem__(self, key): return self.obj[key] def __bool__(self): return bool(self.obj) def getparent(self): return None def __str__(self): return str(self.obj) def __unicode__(self): return str(self.obj) def __len__(self): return len(self.obj) class TreeWalker(base.NonRecursiveTreeWalker): def __init__(self, tree): # pylint:disable=redefined-variable-type if isinstance(tree, list): self.fragmentChildren = set(tree) tree = FragmentRoot(tree) else: self.fragmentChildren = set() tree = Root(tree) base.NonRecursiveTreeWalker.__init__(self, tree) self.filter = _ihatexml.InfosetFilter() def getNodeDetails(self, node): if isinstance(node, tuple): # Text node node, key = node assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key return base.TEXT, ensure_str(getattr(node, key)) elif isinstance(node, Root): return (base.DOCUMENT,) elif isinstance(node, Doctype): return base.DOCTYPE, node.name, node.public_id, node.system_id elif isinstance(node, FragmentWrapper) and not hasattr(node, "tag"): return base.TEXT, ensure_str(node.obj) elif node.tag == etree.Comment: return base.COMMENT, ensure_str(node.text) elif node.tag == etree.Entity: return base.ENTITY, ensure_str(node.text)[1:-1] # strip &; else: # This is assumed to be an ordinary element match = tag_regexp.match(ensure_str(node.tag)) if match: namespace, tag = match.groups() else: namespace = None tag = ensure_str(node.tag) attrs = OrderedDict() for name, value in list(node.attrib.items()): name = ensure_str(name) value = ensure_str(value) match = tag_regexp.match(name) if match: attrs[(match.group(1), match.group(2))] = value else: attrs[(None, name)] = value return (base.ELEMENT, namespace, self.filter.fromXmlName(tag), attrs, len(node) > 0 or node.text) def getFirstChild(self, node): assert not isinstance(node, tuple), "Text nodes have no children" assert len(node) or node.text, "Node has no children" if node.text: return (node, "text") else: return node[0] def getNextSibling(self, node): if isinstance(node, tuple): # Text node node, key = node assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key if key == "text": # XXX: we cannot use a "bool(node) and node[0] or None" construct here # because node[0] might evaluate to False if it has no child element if len(node): return node[0] else: return None else: # tail return node.getnext() return (node, "tail") if node.tail else node.getnext() def getParentNode(self, node): if isinstance(node, tuple): # Text node node, key = node assert key in ("text", "tail"), "Text nodes are text or tail, found %s" % key if key == "text": return node # else: fallback to "normal" processing elif node in self.fragmentChildren: return None return node.getparent() etree.py 0000644 00000010707 15030125115 0006220 0 ustar 00 from __future__ import absolute_import, division, unicode_literals from collections import OrderedDict import re from pip._vendor.six import string_types from . import base from .._utils import moduleFactoryFactory tag_regexp = re.compile("{([^}]*)}(.*)") def getETreeBuilder(ElementTreeImplementation): ElementTree = ElementTreeImplementation ElementTreeCommentType = ElementTree.Comment("asd").tag class TreeWalker(base.NonRecursiveTreeWalker): # pylint:disable=unused-variable """Given the particular ElementTree representation, this implementation, to avoid using recursion, returns "nodes" as tuples with the following content: 1. The current element 2. The index of the element relative to its parent 3. A stack of ancestor elements 4. A flag "text", "tail" or None to indicate if the current node is a text node; either the text or tail of the current element (1) """ def getNodeDetails(self, node): if isinstance(node, tuple): # It might be the root Element elt, _, _, flag = node if flag in ("text", "tail"): return base.TEXT, getattr(elt, flag) else: node = elt if not(hasattr(node, "tag")): node = node.getroot() if node.tag in ("DOCUMENT_ROOT", "DOCUMENT_FRAGMENT"): return (base.DOCUMENT,) elif node.tag == "<!DOCTYPE>": return (base.DOCTYPE, node.text, node.get("publicId"), node.get("systemId")) elif node.tag == ElementTreeCommentType: return base.COMMENT, node.text else: assert isinstance(node.tag, string_types), type(node.tag) # This is assumed to be an ordinary element match = tag_regexp.match(node.tag) if match: namespace, tag = match.groups() else: namespace = None tag = node.tag attrs = OrderedDict() for name, value in list(node.attrib.items()): match = tag_regexp.match(name) if match: attrs[(match.group(1), match.group(2))] = value else: attrs[(None, name)] = value return (base.ELEMENT, namespace, tag, attrs, len(node) or node.text) def getFirstChild(self, node): if isinstance(node, tuple): element, key, parents, flag = node else: element, key, parents, flag = node, None, [], None if flag in ("text", "tail"): return None else: if element.text: return element, key, parents, "text" elif len(element): parents.append(element) return element[0], 0, parents, None else: return None def getNextSibling(self, node): if isinstance(node, tuple): element, key, parents, flag = node else: return None if flag == "text": if len(element): parents.append(element) return element[0], 0, parents, None else: return None else: if element.tail and flag != "tail": return element, key, parents, "tail" elif key < len(parents[-1]) - 1: return parents[-1][key + 1], key + 1, parents, None else: return None def getParentNode(self, node): if isinstance(node, tuple): element, key, parents, flag = node else: return None if flag == "text": if not parents: return element else: return element, key, parents, None else: parent = parents.pop() if not parents: return parent else: assert list(parents[-1]).count(parent) == 1 return parent, list(parents[-1]).index(parent), parents, None return locals() getETreeModule = moduleFactoryFactory(getETreeBuilder) dom.py 0000644 00000002605 15030125115 0005671 0 ustar 00 from __future__ import absolute_import, division, unicode_literals from xml.dom import Node from . import base class TreeWalker(base.NonRecursiveTreeWalker): def getNodeDetails(self, node): if node.nodeType == Node.DOCUMENT_TYPE_NODE: return base.DOCTYPE, node.name, node.publicId, node.systemId elif node.nodeType in (Node.TEXT_NODE, Node.CDATA_SECTION_NODE): return base.TEXT, node.nodeValue elif node.nodeType == Node.ELEMENT_NODE: attrs = {} for attr in list(node.attributes.keys()): attr = node.getAttributeNode(attr) if attr.namespaceURI: attrs[(attr.namespaceURI, attr.localName)] = attr.value else: attrs[(None, attr.name)] = attr.value return (base.ELEMENT, node.namespaceURI, node.nodeName, attrs, node.hasChildNodes()) elif node.nodeType == Node.COMMENT_NODE: return base.COMMENT, node.nodeValue elif node.nodeType in (Node.DOCUMENT_NODE, Node.DOCUMENT_FRAGMENT_NODE): return (base.DOCUMENT,) else: return base.UNKNOWN, node.nodeType def getFirstChild(self, node): return node.firstChild def getNextSibling(self, node): return node.nextSibling def getParentNode(self, node): return node.parentNode __pycache__/__init__.cpython-310.pyc 0000644 00000007552 15030125115 0013216 0 ustar 00 o �7]hW � @ sZ d Z ddlmZmZmZ ddlmZ ddlmZ ddgZ i Z d d d�Zd d� Zdd� Z dS )a� A collection of modules for iterating through different kinds of tree, generating tokens identical to those produced by the tokenizer module. To create a tree walker for a new type of tree, you need to implement a tree walker object (called TreeWalker by convention) that implements a 'serialize' method which takes a tree as sole argument and returns an iterator which generates tokens. � )�absolute_import�division�unicode_literals� )� constants)� default_etree� getTreeWalker�pprintNc K s� | � � } | tvrR| dkrddlm} |jt| <