Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
def _parse_element(self, element):
return etree.tostring(element, encoding="utf-8").decode(encoding="utf-8")
class TextField(_LxmlElementField):
    """
    Field that extracts the text content of an element.
    """

    def _parse_element(self, element):
        """
        Return the concatenated text found under ``element``.

        Every string yielded by ``element.itertext()`` is stripped of
        surrounding whitespace and the pieces are joined with no separator.
        When the joined text is empty, ``self.default`` is returned instead.

        :param element: object exposing ``itertext()``
        :return: the joined text, or ``self.default`` if it is empty
        """
        stripped_pieces = (piece.strip() for piece in element.itertext())
        text = "".join(stripped_pieces)
        if text:
            return text
        return self.default
class RegexField(BaseField):
"""
This field is used to extract raw HTML code with a regular expression.
RegexField uses the standard library `re` module internally, so it performs better than _LxmlElementField.
"""
def __init__(self, re_select: str, re_flags=0, default="", many: bool = False):
    """
    Store the pattern and compile it once at construction time.

    :param re_select: regular expression pattern, kept as ``self._re_select``
    :param re_flags: flags forwarded to ``re.compile``
    :param default: forwarded to the parent initializer
    :param many: forwarded to the parent initializer
    """
    super().__init__(default=default, many=many)
    self._re_select = re_select
    # Compile eagerly so the pattern object is ready on ``self._re_object``.
    self._re_object = re.compile(re_select, flags=re_flags)
def _parse_match(self, match):
"""
If there is a group dict, return the dict;
even if there's only one value in the dict, return a dictionary;
If there is a group in match, return the group;
if there is only one value in the group, return the value;
def __new__(cls, name, bases, attrs):
    """
    Create the new class, collecting its ``BaseField`` attributes.

    Every entry of ``attrs`` whose value is a ``BaseField`` instance is
    removed from ``attrs`` and gathered into a dict that is stored under
    the ``"__fields"`` key, so the created class carries a single
    ``__fields`` mapping of field name to field object.

    :param cls: the metaclass
    :param name: name of the class being created
    :param bases: tuple of base classes
    :param attrs: class namespace; mutated in place (fields are popped,
        ``"__fields"`` is added)
    :return: the newly created class
    """
    # Build the mapping directly with a dict comprehension. Going through a
    # set of (name, field) pairs would discard declaration order and would
    # raise TypeError for any field object that is not hashable.
    # ``list(...)`` snapshots the items because ``attrs`` is popped from
    # while we iterate.
    fields = {
        field_name: attrs.pop(field_name)
        for field_name, value in list(attrs.items())
        if isinstance(value, BaseField)
    }
    attrs["__fields"] = fields
    return type.__new__(cls, name, bases, attrs)
def __init__(self, default="", many: bool = False):
"""
Init BaseField class
url: http://lxml.de/index.html
:param default: default value
:param many: if there are many fields in one page
"""
self.default = default
self.many = many
def extract(self, *args, **kwargs):
    """
    Placeholder that always raises ``NotImplementedError``.

    Any positional or keyword arguments are accepted and ignored.
    Presumably meant to be overridden by concrete field classes.

    :raises NotImplementedError: always
    """
    message = "extract is not implemented."
    raise NotImplementedError(message)
class _LxmlElementField(BaseField):
def __init__(
    self,
    css_select: str = None,
    xpath_select: str = None,
    default=None,
    many: bool = False,
):
    """
    Record the selectors for this field.

    :param css_select: CSS selector, see http://lxml.de/cssselect.html
    :param xpath_select: XPath expression, see http://www.w3school.com.cn/xpath/index.asp
    :param default: forwarded to the parent initializer
    :param many: forwarded to the parent initializer
    """
    super().__init__(default=default, many=many)
    # Both selectors are stored as given; nothing is validated here.
    self.xpath_select = xpath_select
    self.css_select = css_select