Secure your code as it's written. Use Snyk Code to scan source code in minutes - no build needed - and fix issues immediately.
return elements if self.many else elements[0]
if elements:
results = [self._parse_element(element) for element in elements]
elif self.default is None:
raise NothingMatchedError(
f"Extract `{self.css_select or self.xpath_select}` error, "
f"please check selector or set parameter named `default`"
)
else:
results = [self.default]
return results if self.many else results[0]
class AttrField(_LxmlElementField):
    """
    This field is used to get attribute.
    """

    def __init__(
        self,
        attr,
        css_select: str = None,
        xpath_select: str = None,
        default="",
        many: bool = False,
    ):
        """
        :param attr: name of the attribute read from each matched element
        :param css_select: forwarded unchanged to the parent initializer
        :param xpath_select: forwarded unchanged to the parent initializer
        :param default: forwarded to the parent initializer; also returned by
            ``_parse_element`` when an element does not carry ``attr``
        :param many: forwarded unchanged to the parent initializer
        """
        super(AttrField, self).__init__(
            css_select=css_select, xpath_select=xpath_select, default=default, many=many
        )
        self.attr = attr

    def _parse_element(self, element):
        """
        Look up ``self.attr`` on ``element``.

        :param element: object exposing ``get(name, default)``
        :return: the attribute value, or ``self.default`` when it is absent
        """
        return element.get(self.attr, self.default)
class HtmlField(_LxmlElementField):
    """
    Field that yields the raw html markup of each matched element.
    """

    def _parse_element(self, element):
        """
        Serialize ``element`` through ``etree.tostring`` and decode the result.

        :param element: element to serialize
        :return: the utf-8 serialization decoded into a ``str``
        """
        serialized = etree.tostring(element, encoding="utf-8")
        return serialized.decode(encoding="utf-8")
class TextField(_LxmlElementField):
    """
    This field is used to get text.
    """

    def _parse_element(self, element):
        """
        Concatenate the stripped text of every node yielded by ``element.itertext()``.

        :param element: object exposing ``itertext()``
        :return: the joined text, or ``self.default`` when the joined text is empty
        """
        strings = [node.strip() for node in element.itertext()]
        string = "".join(strings)
        # An empty result falls back to the configured default.
        return string if string else self.default
class HtmlField(_LxmlElementField):
    """
    Field that returns the raw html data of a matched element.
    """

    def _parse_element(self, element):
        """
        Turn ``element`` into its html source text.

        :param element: element handed to ``etree.tostring``
        :return: ``str`` decoded from the utf-8 bytes that ``etree.tostring`` produced
        """
        html_bytes = etree.tostring(element, encoding="utf-8")
        html_text = html_bytes.decode(encoding="utf-8")
        return html_text
class TextField(_LxmlElementField):
    """
    Field that yields the text content of each matched element.
    """

    def _parse_element(self, element):
        """
        Join the stripped text fragments produced by ``element.itertext()``.

        :param element: object exposing ``itertext()``
        :return: the concatenated text, or ``self.default`` if nothing is left
        """
        text = "".join(fragment.strip() for fragment in element.itertext())
        if text:
            return text
        return self.default
class RegexField(BaseField):
    """
    This field is used to get raw html code by regular expression.
    RegexField uses the standard library `re` internally, that is to say it has a better performance than _LxmlElementField.
    """

    # NOTE(review): the docstring above describes regex extraction, yet this
    # initializer only stores CSS/XPath selectors -- confirm it belongs to this class.
    def __init__(
        self,
        css_select: str = None,
        xpath_select: str = None,
        default=None,
        many: bool = False,
    ):
        """
        :param css_select: css select http://lxml.de/cssselect.html
        :param xpath_select: http://www.w3school.com.cn/xpath/index.asp
        :param default: inherit
        :param many: inherit
        """
        # Zero-argument super() resolves against the class this method is
        # defined in. The previous `super(_LxmlElementField, self)` raises
        # TypeError whenever `self` is not an _LxmlElementField instance.
        super().__init__(default=default, many=many)
        self.css_select = css_select
        self.xpath_select = xpath_select