跳转至

subparser.py

baiduspider.mobile.parser.subparser ⚓︎

WebSubParser ⚓︎

网页搜索子解析模块。

此模块为BaiduSpider.mobile.search_web函数的子模块,用于解析网页搜索子模块的HTML代码 并返回Python字典。

parse_baike_block(self, baike) ⚓︎

解析百科子块

Parameters:

Name Type Description Default
baike BeautifulSoup

从源HTML代码中提取的百科块BeautifulSoup对象

required

Returns:

Type Description
dict

dict: 解析后自动生成的Python结果字典对象

Source code in baiduspider\mobile\parser\subparser.py
@handle_err
def parse_baike_block(self, baike: BeautifulSoup) -> dict:
    """解析百科子块

    Args:
        baike (BeautifulSoup): 从源HTML代码中提取的百科块BeautifulSoup对象

    Returns:
        dict: 解析后自动生成的Python结果字典对象
    """
    if baike is None:
        return []
    b_url = json.loads(baike["data-log"])["mu"]
    baike = baike.find("article")
    title = baike.find("h3", class_="c-title").text
    baike = baike.find("section")
    try:
        poster = baike.find("img", class_="c-img-img")["src"]
    except:
        poster = None
    des = baike.find("div", class_="c-abstract").text
    section_container = baike.findAll("a", class_="c-slink")
    sections = []
    for section in section_container:
        text = section.text
        url = section["href"]
        sections.append({"text": text, "url": url})
    _ = baike.find("div", class_="c-source")
    origin = _.find("span", class_="c-color-source").text
    label_container = _.findAll("span", class_="c-label-radius")
    labels = [l.text for l in label_container]
    result = {
        "title": title,
        "poster": poster,
        "des": des,
        "sections": sections,
        "origin": origin,
        "labels": labels,
        "url": b_url,
    }
    return {"result": result}

parse_knowledge_block(self, knowledge) ⚓︎

解析相关知识子块

Parameters:

Name Type Description Default
knowledge BeautifulSoup

从源HTML代码中提取的相关知识块BeautifulSoup对象

required

Returns:

Type Description
dict

dict: 解析后自动生成的Python结果字典对象

Source code in baiduspider\mobile\parser\subparser.py
@handle_err
def parse_knowledge_block(self, knowledge: BeautifulSoup) -> dict:
    """解析相关知识子块

    Args:
        knowledge (BeautifulSoup): 从源HTML代码中提取的相关知识块BeautifulSoup对象

    Returns:
        dict: 解析后自动生成的Python结果字典对象
    """
    from pprint import pprint

    if knowledge is None:
        return []
    knowledge = knowledge.find("article")
    k_title = knowledge.find("header").find("h3").text
    knowledge_container = knowledge.findAll("div", class_="c-scroll-item")
    results = []
    for kl in knowledge_container:
        url = kl.find("a")["href"].replace("&", "&")
        image = kl.find("img")["data-lazy-src"]
        title = kl.find("div", class_="c-color-link").text
        try:
            des = kl.find("div", class_="c-color-gray").text
            if not des.strip():
                des = None
        except AttributeError:
            des = None
        results.append({"url": url, "image": image, "title": title, "des": des})
    return {"results": results, "title": k_title}

parse_reyi_block(self, reyi) ⚓︎

解析热议子块

Parameters:

Name Type Description Default
reyi BeautifulSoup

从源HTML代码中提取的热议块BeautifulSoup对象

required

Returns:

Type Description
dict

dict: 解析后自动生成的Python结果字典对象

Source code in baiduspider\mobile\parser\subparser.py
@handle_err
def parse_reyi_block(self, reyi: BeautifulSoup) -> dict:
    """解析热议子块

    Args:
        reyi (BeautifulSoup): 从源HTML代码中提取的热议块BeautifulSoup对象

    Returns:
        dict: 解析后自动生成的Python结果字典对象
    """
    if reyi is None:
        return []
    reyi = reyi.find("section")
    r_url = reyi.find("a", class_="c-blocka")["href"]
    r_total = self._reformat_big_num(
        reyi.find("a", class_="middle").text.strip("条\ue734"), "全部"
    )
    post_container = reyi.findAll("div", class_="tts-b-item")
    posts = []
    for post in post_container:
        _ = post.find("div", role="text")
        author_avatar = _.find("img")["src"]
        __ = _.find("div", class_="c-line-clamp1").findAll("span")
        author = __[0].text
        site = __[1].text
        pub_time = __[-1].text
        _ = post.find("div", role="option")
        des = _.find("p", class_="text-container").text.replace("\ue60b", "")
        __ = _.findAll("img", class_="c-img-img")
        images = []
        for img in __:
            try:
                images.append(img["data-lazy-src"])
            except:
                pass
        try:
            origin = _.find("div", class_="origin-content-new").text
        except:
            origin = None
        ___ = post.findAll("i", class_=["c-icon", "c-gap-inner-right-small"])
        __ = []
        for t in ___:
            for c in t["class"]:
                if "zk-icon-size" in c:
                    __.append(t)
                    break
        __ = __[1:]
        try:
            comments = int(__[0].find_next_sibling("span").text.strip())
        except:
            comments = 0
        try:
            likes = int(__[1].find_next_sibling("span").text.strip())
        except:
            likes = 0
        posts.append(
            {
                "author_avatar": author_avatar,
                "author": author,
                "site": site,
                "pub_time": pub_time,
                "des": des,
                "images": images,
                "origin": origin,
                "comments": comments,
                "likes": likes,
            }
        )
    return {"results": posts, "url": r_url, "total": r_total}

parse_short_video_block(self, short_video) ⚓︎

解析短视频子块

Parameters:

Name Type Description Default
short_video BeautifulSoup

从源HTML代码中提取的短视频块BeautifulSoup对象

required

Returns:

Type Description
dict

dict: 解析后自动生成的Python结果字典对象

Source code in baiduspider\mobile\parser\subparser.py
@handle_err
def parse_short_video_block(self, short_video: BeautifulSoup) -> dict:
    """解析短视频子块

    Args:
        short_video (BeautifulSoup): 从源HTML代码中提取的短视频块BeautifulSoup对象

    Returns:
        dict: 解析后自动生成的Python结果字典对象
    """
    if short_video is None:
        return []
    s_total = self._reformat_big_num(
        short_video.find("div", class_="middle")
        .find("span", class_="text")
        .text.strip("条\ue734"),
        "全部",
    )
    s_url = short_video.find("article")["rl-link-href"]
    short_video = short_video.find("section")
    short_video_container = short_video.findAll("div", class_="vid-pocket-item")
    results = []
    for vid in short_video_container:
        poster = vid.find("img")["data-lazy-src"]
        url = vid.parent.parent["href"]
        _ = vid.find("div", class_="c-color-white")
        try:
            author_avatar = _.find("img")["src"]
        except:
            author_avatar = None
        author = _.find("span", class_="c-color-white").text
        play_times = _.findAll("span")[-1].text
        if play_times is not None and "次播放" in play_times:
            delta = 1
            if "万" in play_times:
                delta = 10000
            elif "亿" in play_times:
                delta = 100000000
            play_times = int(
                float(
                    play_times.replace("次播放", "").replace("万", "").replace("亿", "")
                )
                * delta
            )
        else:
            play_times = None
        title = vid.find("div", class_="c-gap-bottom-small").text
        results.append(
            {
                "title": title,
                "poster": poster,
                "url": url,
                "author_avatar": author_avatar,
                "author": author,
                "play_times": play_times,
            }
        )
    return {"results": results, "total": s_total, "url": s_url}

parse_video_block(self, video) ⚓︎

解析视频子块

Parameters:

Name Type Description Default
video BeautifulSoup

从源HTML代码中提取的视频块BeautifulSoup对象

required

Returns:

Type Description
dict

dict: 解析后自动生成的Python结果字典对象

Source code in baiduspider\mobile\parser\subparser.py
@handle_err
def parse_video_block(self, video: BeautifulSoup) -> dict:
    """解析视频子块

    Args:
        video (BeautifulSoup): 从源HTML代码中提取的视频块BeautifulSoup对象

    Returns:
        dict: 解析后自动生成的Python结果字典对象
    """
    if video is None:
        return []
    video = video.find("article")
    tags_container = (
        video.find("section")
        .find("div", class_="c-tags-scroll-wrapper")
        .findAll("li")
    )
    v_url = video["rl-link-href"]
    results = []
    tags = []
    for tag in tags_container:
        text = self._format(tag.text)
        url = tag.find("a")["href"]
        tags.append({"text": text, "url": url})
    vid_container = video.findAll("div", class_="c-span")
    for vid in vid_container:
        vid = vid.find("div", class_="c-touchable-feedback-content")
        url = vid.find("a")["href"]
        _ = vid.find("div", class_="c-img")
        poster = _.find("img")["src"]
        labels = []
        __ = _.find("span", class_="c-label-radius")
        if __ is not None:
            labels.append(self._format(__.text))
        duration = None
        video_num = None
        __ = _.find("div", class_="c-img-info-br")
        if "视频" in __.text:
            video_num = int(self._format(__.text.strip("个视频")))
        elif ":" in __.text:
            duration = __.text
        title = self._format(vid.find("div", role="text").text)
        _ = vid.findAll("div", class_="c-line-clamp1")[-1]
        try:
            author_avatar = _.find("img")["src"]
            __ = _.findAll("span")
            author = self._format(__[0].text)
            if len(__) > 1 and "vid-scroll-pubtime" in __[1]["class"][0]:
                pub_time = self._format(__[1].text)
            else:
                pub_time = None
            __ = _.find("span", class_="c-label-radius")
            if __ is not None:
                labels.append(__.text)
        except:
            author_avatar = None
            author = None
            pub_time = None
        results.append(
            {
                "poster": poster,
                "labels": labels,
                "duration": duration,
                "video_num": video_num,
                "title": title,
                "author_avatar": author_avatar,
                "author": author,
                "pub_time": pub_time,
                "url": url,
            }
        )
    return {"results": results, "tags": tags, "url": v_url}

评论

Back to top