subparser.py

baiduspider.parser.subparser ⚓︎

WebSubParser ⚓︎

Web search sub-parser module.

This module is a sub-module of the BaiduSpider.search_web function. It parses the HTML of the web-search sub-blocks and returns Python dictionaries.
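
For orientation, here is a minimal sketch of driving one of these sub-parsers by hand. It assumes that `WebSubParser` can be instantiated without arguments; the HTML snippet and the attribute filter used to isolate a result block are illustrative placeholders, not the selectors BaiduSpider uses internally.

```python
from bs4 import BeautifulSoup

from baiduspider.parser.subparser import WebSubParser

# Source of a Baidu web-search results page, fetched elsewhere (e.g. with requests).
html = "<html>...</html>"
soup = BeautifulSoup(html, "html.parser")

subparser = WebSubParser()  # assumption: no constructor arguments required

# Illustrative only: the attribute filter is a placeholder, not the real selector.
tieba_block = soup.find("div", attrs={"tpl": "tieba_general"})
if tieba_block is not None:
    result = subparser.parse_tieba_block(tieba_block)
    print(result["title"], result["followers"])
```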

parse_baike_block(self, baike) ⚓︎

Parse the Baike (encyclopedia) sub-block.

Parameters:

| Name  | Type          | Description                                                             | Default  |
| ----- | ------------- | ----------------------------------------------------------------------- | -------- |
| baike | BeautifulSoup | BeautifulSoup object of the Baike block extracted from the source HTML  | required |

Returns:

| Type | Description                                                          |
| ---- | -------------------------------------------------------------------- |
| dict | The Python result dictionary generated automatically from the parse  |

Source code in baiduspider\parser\subparser.py
@handle_err
def parse_baike_block(self, baike: BeautifulSoup) -> dict:
    """解析百科子块

    Args:
        baike (BeautifulSoup): 从源HTML代码中提取的百科块BeautifulSoup对象

    Returns:
        dict: 解析后自动生成的Python结果字典对象
    """
    if baike:
        b_title = self._format(baike.find("h3").text)
        b_url = baike.find("a")["href"]
        b_des = self._format(
            baike.find("div", class_="c-span-last")
            .find("div", class_="c-font-normal")
            .text
        )
        try:
            b_cover = baike.find("div", class_="c-span3").find("img")["src"]
            b_cover_type = "image"
        except (TypeError, AttributeError):
            try:
                b_cover = (
                    baike.find("div", class_="op-bk-polysemy-imgWrap")
                    .find("div", class_="c-img")["style"]
                    .split("url", 1)[-1]
                    .split(")", 1)[0]
                    .strip("(")
                )
                b_cover_type = "video"
            except (TypeError, AttributeError):
                b_cover = None
                b_cover_type = None
        baike = {
            "title": b_title,
            "url": b_url,
            "des": b_des,
            "cover": b_cover,
            "cover-type": b_cover_type,
        }
    return baike
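
For reference, the dictionary assembled at the end of the method above has the following shape; all values here are placeholders.

```python
# Shape of the value returned by parse_baike_block (placeholder values).
baike_result = {
    "title": "...",         # title of the Baike entry
    "url": "...",           # link to the Baike entry
    "des": "...",           # short description text
    "cover": "...",         # cover URL (image src or video poster), or None
    "cover-type": "image",  # "image", "video", or None when no cover is found
}
```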

parse_blog_block(self, blog) ⚓︎

Parse the blog sub-block.

Parameters:

| Name | Type          | Description                                                            | Default  |
| ---- | ------------- | ---------------------------------------------------------------------- | -------- |
| blog | BeautifulSoup | BeautifulSoup object of the blog block extracted from the source HTML  | required |

Returns:

| Type | Description                                                          |
| ---- | -------------------------------------------------------------------- |
| dict | The Python result dictionary generated automatically from the parse  |

Source code in baiduspider\parser\subparser.py
@handle_err
def parse_blog_block(self, blog: BeautifulSoup) -> dict:
    """解析博客子块

    Args:
        blog (BeautifulSoup): 从源HTML代码中提取的博客块BeautifulSoup对象

    Returns:
        dict: 解析后自动生成的Python结果字典对象
    """
    if blog is not None:
        blog = blog.find("section")
        b_title = blog.find("h3", class_="c-title").text
        b_url = blog.find("a")["href"]
        b_blogs_ = blog.findAll("div", class_="c-row")
        b_blogs = []
        for b in b_blogs_:
            b_current_blog_header = b.find("div")
            b_blog_title = b_current_blog_header.find("a").text
            b_blog_url = b_current_blog_header.find("a")["href"]
            b_blog_origin = b_current_blog_header.find(
                "span", class_="nor-src-wrap"
            ).text
            try:
                b_current_blog_tags = b.findAll("div")[1].findAll("span")
                b_blog_tags = [tag.text for tag in b_current_blog_tags]
            except IndexError:
                b_blog_tags = []
            b_blog_parent = b.find_parent("div").findAll("div")
            b_blog_des = None
            for p in b_blog_parent:
                if p["class"][0].startswith("blog-summary"):
                    b_blog_des = p.text
                    break
            b_blogs.append(
                {
                    "title": b_blog_title,
                    "url": b_blog_url,
                    "origin": b_blog_origin,
                    "tags": b_blog_tags,
                    "des": b_blog_des,
                }
            )
        blog = {"title": b_title, "url": b_url, "blogs": b_blogs}
    return blog
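
The blog result nests one dictionary per post under the `blogs` key; a placeholder outline of the returned value:

```python
# Shape of the value returned by parse_blog_block (placeholder values).
blog_result = {
    "title": "...",           # title of the blog block
    "url": "...",             # link of the blog block
    "blogs": [
        {
            "title": "...",   # post title
            "url": "...",     # post link
            "origin": "...",  # source site of the post
            "tags": ["..."],  # tags, possibly an empty list
            "des": None,      # summary text, or None if absent
        },
    ],
}
```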

parse_gitee_block(self, gitee) ⚓︎

Parse the Gitee repository sub-block.

Parameters:

| Name  | Type          | Description                                                                        | Default  |
| ----- | ------------- | ---------------------------------------------------------------------------------- | -------- |
| gitee | BeautifulSoup | BeautifulSoup object of the Gitee repository block extracted from the source HTML  | required |

Returns:

| Type | Description                                                          |
| ---- | -------------------------------------------------------------------- |
| dict | The Python result dictionary generated automatically from the parse  |

Source code in baiduspider\parser\subparser.py
@handle_err
def parse_gitee_block(self, gitee: BeautifulSoup) -> dict:
    """解析Gitee仓库子块

    Args:
        gitee (BeautifulSoup): 从源HTML代码中提取的码云仓库块BeautifulSoup对象

    Returns:
        dict: 解析后自动生成的Python结果字典对象
    """
    if gitee is not None:
        g_title = gitee.find("h3", class_="c-title").text
        g_url = gitee.find("a", class_="c-blocka")["href"]
        gitee = gitee.find("section").find("div", class_="c-tabs-content-wrapper")
        g_tabs = gitee.findAll("div", class_="c-tabs-content")
        g_intro = g_tabs[0].find("div", class_="c-tabs-item").find("div")
        g_des = g_intro.find("div").text
        g_license = (
            g_intro.findAll("div")[1].text.lstrip("开源协议:")
            if len(g_intro.findAll("div")) >= 2
            and g_intro.findAll("div")[1].text.startswith("开源协议:")
            else None
        )
        g_lang = (
            g_intro.findAll("div")[2].text.lstrip("开发语言:")
            if len(g_intro.findAll("div")) >= 2
            and g_intro.findAll("div")[1].text.startswith("开发语言:")
            or len(g_intro.findAll("div")) >= 3
            and g_intro.findAll("div")[2].text.startswith("开发语言:")
            else None
        )
        g_temp = g_intro.findAll("span")
        g_star = int(g_temp[0].text.strip("Star:"))
        g_fork = int(g_temp[1].text.strip("Fork:"))
        g_watch = int(g_temp[2].text.strip("Watch:"))
        g_status = g_tabs[-1].find("img")["src"]
        gitee = {
            "title": g_title,
            "url": g_url,
            "des": g_des,
            "license": g_license,
            "lang": g_lang,
            "star": g_star,
            "fork": g_fork,
            "watch": g_watch,
            "status": g_status,
        }
    return gitee
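
Putting the fields together, the value returned by `parse_gitee_block` looks like this (placeholder values):

```python
# Shape of the value returned by parse_gitee_block (placeholder values).
gitee_result = {
    "title": "...",   # repository title
    "url": "...",     # repository link
    "des": "...",     # repository description
    "license": None,  # open-source license, or None if not shown
    "lang": None,     # main programming language, or None if not shown
    "star": 0,        # star count (int)
    "fork": 0,        # fork count (int)
    "watch": 0,       # watch count (int)
    "status": "...",  # image URL taken from the last (status) tab
}
```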

parse_music_block(self, music) ⚓︎

Parse the music sub-block.

Parameters:

| Name  | Type          | Description                                                             | Default  |
| ----- | ------------- | ----------------------------------------------------------------------- | -------- |
| music | BeautifulSoup | BeautifulSoup object of the music block extracted from the source HTML  | required |

Returns:

| Type | Description                                                          |
| ---- | -------------------------------------------------------------------- |
| dict | The Python result dictionary generated automatically from the parse  |

Source code in baiduspider\parser\subparser.py
@handle_err
def parse_music_block(self, music: BeautifulSoup) -> dict:
    """解析音乐子块

    Args:
        music (BeautifulSoup): 从源HTML代码中提取的音乐块BeautifulSoup对象

    Returns:
        dict: 解析后自动生成的Python结果字典对象
    """
    if music is not None:
        # Extract the result JSON embedded in an HTML comment
        music = json.loads(
            music.find(text=lambda text: isinstance(text, Comment)).strip(
                "s-data: "
            )
        )
        m_title = music["title"].replace("<em>", "").replace("</em>", "")  # 搜索结果标题
        m_url = music["url"]  # 搜索结果链接
        m_songs = []  # 搜索结果歌曲
        for song in music["data"]["site"]:
            # Singer info
            s_singer = [
                {"url": i["singerUrl"], "name": i["singerName"]}
                for i in song["singer"]
            ]
            # Pre-process the song publish date
            try:
                __ = song["publishTime"].split("-")
            except KeyError:
                __ = None
            # Pre-process the song duration (total seconds)
            _ = int(song["duration"])
            # Song info
            s_song = {
                "name": song["displaySongName"],  # song name
                "url": song["songUrl"],  # song link
                "poster": song["poster"],  # poster image link
                "is_original": bool(int(song["isOriginal"])),  # whether it is the original recording
                "pub_date": datetime(int(__[0]), int(__[1]), int(__[2]))
                if __ is not None
                else None,  # publish date
                "labels": [i["txt"] for i in song["labels"]],  # song labels
                "copyright": bool(int(song["copyRight"])),  # whether the song is licensed
                "site": song["sitePinyin"],  # source site (pinyin)
                "duration": time(
                    hour=_ // 3600, minute=_ // 60 % 60, second=_ % 60
                ),  # song duration
                "other_sites": song["allWapPlayFile"],  # links on other sites
            }
            # Publishing company
            try:
                s_song["pub_company"] = song["pubCompany"]
                if song["pubCompany"] == "null":
                    s_song["pub_company"] = None
            except KeyError:
                s_song["pub_company"] = None
            # Album
            try:
                s_album = {
                    "url": song["album"]["albumUrl"],
                    "name": song["album"]["albumName"],
                }
            except KeyError:
                s_album = None
            m_songs.append({"song": s_song, "singer": s_singer, "album": s_album})
        music = {"title": m_title, "url": m_url, "songs": m_songs}
    return music
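
The music result nests song, singer and album information per entry; an outline with placeholder values:

```python
# Shape of the value returned by parse_music_block (placeholder values).
music_result = {
    "title": "...",  # title of the music block
    "url": "...",    # link of the music block
    "songs": [
        {
            "song": {
                "name": "...",         # song name
                "url": "...",          # song link
                "poster": "...",       # poster image link
                "is_original": False,  # whether it is the original recording
                "pub_date": None,      # datetime.datetime, or None if unknown
                "labels": ["..."],     # song labels
                "copyright": True,     # whether the song is licensed
                "site": "...",         # source site (pinyin)
                "duration": None,      # datetime.time with the song length
                "other_sites": [],     # links on other sites
                "pub_company": None,   # publishing company, or None
            },
            "singer": [{"url": "...", "name": "..."}],  # one dict per singer
            "album": None,  # {"url": ..., "name": ...}, or None if absent
        },
    ],
}
```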

parse_news_block(self, news) ⚓︎

Parse the news sub-block.

Parameters:

| Name | Type          | Description                                                            | Default  |
| ---- | ------------- | ---------------------------------------------------------------------- | -------- |
| news | BeautifulSoup | BeautifulSoup object of the news block extracted from the source HTML  | required |

Returns:

| Type | Description                                 |
| ---- | ------------------------------------------- |
| list | A list of parsed news result dictionaries   |

Source code in baiduspider\parser\subparser.py
@handle_err
def parse_news_block(self, news: BeautifulSoup) -> list:
    """Parse the news sub-block.

    Args:
        news (BeautifulSoup): BeautifulSoup object of the news block extracted from the source HTML

    Returns:
        list: a list of automatically generated Python result dictionaries
    """
    try:
        self._format(news.find("h3", class_="t").find("a").text)
    except (AttributeError, TypeError):
        news_detail = []
    else:
        news_rows = news.findAll("div", class_="c-row")
        news_detail = []
        prev_row = {}
        for row in news_rows:
            try:
                row_title = self._format(row.find("a").text)
            except AttributeError:
                prev_row["des"] = self._format(row.text)
                continue
            row_time = self._format(row.find("span", class_="c-color-gray2").text)
            row_author = self._format(row.find("span", class_="c-color-gray").text)
            row_url = self._format(row.find("a")["href"])
            news_detail.append(
                {
                    "title": row_title,
                    "time": row_time,
                    "author": row_author,
                    "url": row_url,
                    "des": None,
                }
            )
            prev_row = news_detail[-1]
    return news_detail
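
Unlike most of the other sub-parsers, this method returns a list rather than a single dictionary; each entry has the following shape (placeholder values):

```python
# Shape of the value returned by parse_news_block: a list of per-article dictionaries.
news_result = [
    {
        "title": "...",   # news title
        "time": "...",    # publication time as displayed
        "author": "...",  # source / author
        "url": "...",     # news link
        "des": None,      # summary text taken from the following row, or None
    },
]
```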

parse_tieba_block(self, tieba) ⚓︎

Parse the Tieba (forum) sub-block.

Parameters:

| Name  | Type          | Description                                                             | Default  |
| ----- | ------------- | ----------------------------------------------------------------------- | -------- |
| tieba | BeautifulSoup | BeautifulSoup object of the Tieba block extracted from the source HTML  | required |

Returns:

| Type | Description                                                          |
| ---- | -------------------------------------------------------------------- |
| dict | The Python result dictionary generated automatically from the parse  |

Source code in baiduspider\parser\subparser.py
@handle_err
def parse_tieba_block(self, tieba: BeautifulSoup) -> dict:
    """解析贴吧子块

    Args:
        tieba (BeautifulSoup): 从源HTML代码中提取的贴吧块BeautifulSoup对象

    Returns:
        dict: 解析后自动生成的Python结果字典对象
    """
    if tieba:
        t_title = self._format(tieba.find("h3").text)
        t_url = tieba["mu"]
        try:
            t_info_ = tieba.find(
                "div", class_="op-tieba-general-col-top-xs"
            ).findAll("p")
            t_des = self._format(t_info_[0].text)
        except AttributeError:
            t_des = None
        t_followers = self._format(
            tieba.find("div", class_="c-span-last").find("span").find("span").text
        )
        t_total = self._format(
            tieba.find("div", class_="c-span-last").findAll("span")[-1].text
        )
        try:
            t_cover = tieba.find("a", class_="op-tieba-general-photo-link").find(
                "img"
            )["src"]
        except AttributeError:
            t_cover = None
        t_hot_ = tieba.findAll("div", class_="c-row")[1:-1]
        t_hot = []
        for hot in t_hot_:
            t_h_title = self._format(hot.find("a").text)
            t_h_url = hot.find("a")["href"]
            t_h_clicks = self._format(
                hot.find("div", class_="c-color-gray2").find("span").text
            )
            t_h_replies = self._format(
                hot.findAll("div", class_="c-color-gray2")[-1].find("span").text
            )
            t_hot.append(
                {
                    "title": t_h_title,
                    "url": t_h_url,
                    "clicks": t_h_clicks,
                    "replies": t_h_replies,
                }
            )
        tieba = {
            "title": t_title,
            "url": t_url,
            "des": t_des,
            "followers": t_followers,
            "total": t_total,
            "cover": t_cover,
            "hot": t_hot,
        }
    return tieba
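
The tieba result bundles forum metadata with a list of hot threads; a placeholder outline:

```python
# Shape of the value returned by parse_tieba_block (placeholder values).
tieba_result = {
    "title": "...",      # forum name
    "url": "...",        # forum link
    "des": "...",        # forum description, or None
    "followers": "...",  # follower count as displayed (string)
    "total": "...",      # total post count as displayed (string)
    "cover": None,       # cover image URL, or None
    "hot": [             # hot threads
        {"title": "...", "url": "...", "clicks": "...", "replies": "..."},
    ],
}
```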

parse_video_block(self, video) ⚓︎

Parse the video sub-block.

Parameters:

| Name  | Type          | Description                                                             | Default  |
| ----- | ------------- | ----------------------------------------------------------------------- | -------- |
| video | BeautifulSoup | BeautifulSoup object of the video block extracted from the source HTML  | required |

Returns:

| Type | Description                                 |
| ---- | ------------------------------------------- |
| list | A list of parsed video result dictionaries  |

Source code in baiduspider\parser\subparser.py
@handle_err
def parse_video_block(self, video: BeautifulSoup) -> list:
    """Parse the video sub-block.

    Args:
        video (BeautifulSoup): BeautifulSoup object of the video block extracted from the source HTML

    Returns:
        list: a list of automatically generated Python result dictionaries
    """
    if video:
        video_rows = video.findAll("div", class_="c-row")
        video_results = []
        for row in video_rows:
            row_res = []
            videos = row.findAll("div", class_="c-span3")
            for v in videos:
                v_link = v.find("a")
                v_title = v_link["title"]
                v_url = self._format(v_link["href"])
                v_img = v_link.find("img")["src"]
                v_len = self._format(
                    v.find("div", class_="op-short-video-pc-duration-wrap-new").text
                )
                v_from = self._format(
                    v.find("div", class_="op-short-video-pc-clamp1").text
                )
                row_res.append(
                    {
                        "title": v_title,
                        "url": v_url,
                        "cover": v_img,
                        "length": v_len,
                        "origin": v_from,
                    }
                )
            video_results += row_res
    else:
        video_results = []
    return video_results
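
Like `parse_news_block`, this method returns a list; each entry describes one video (placeholder values):

```python
# Shape of the value returned by parse_video_block: a list of video dictionaries.
video_result = [
    {
        "title": "...",   # video title
        "url": "...",     # video link
        "cover": "...",   # cover image URL
        "length": "...",  # video duration as displayed
        "origin": "...",  # source site
    },
]
```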
