��瑕�

Beautiful Soup 是一个可以从 HTML 或 XML 格式文件中提取数据的 Python 库，它可以将HTML 或 XML 数据解析为Python 对象，以方便通过Python代码进行处理。

文档环境

Centos Python BeautifulSoup

  • 本文档中代码的测试环境
  • Beautiful Soup 使用说明

    Beautiful Soup 的基本功能就是对HTML的标签进行查找及编辑。

    基本概念-对象类型

    Beautiful Soup 将复杂 HTML 文档转换成一个复杂的树形结构，每个节点都被转换成一个Python 对象，Beautiful Soup将这些对象定义了4 种类型: Tag、NavigableString、BeautifulSoup、Comment 。

    瀵硅薄绫诲�� ��杩�
    BeautifulSoup
    ��妗g���ㄩ�ㄥ��疏浚�
    Tag
    HTML����绛�
    NavigableString
    ��绛惧��������海洋�
    Comment
    ��涓�绉��规����NavigableString绫诲��锛�褰���绛句腑��NavigableString 琚�娉ㄩ���讹���疏浚�涔�涓鸿�ョ把��

    疏浚�瑁���寮���

    # Beautiful Soup
    pip install bs4
    
    # 瑙f����
    pip install lxml
    pip install html5lib
    
    # ��彭���
    from bs4 import BeautifulSoup
    
    # Method 1: parse directly from a file on disk.
    # A context manager closes the file handle once parsing is done, and the
    # parser is named explicitly (as in the second example below) so the result
    # does not depend on whichever parser Beautiful Soup would otherwise guess.
    with open("index.html") as fp:
        soup = BeautifulSoup(fp, 'lxml')
    
    # �规�河渠�锛���疏浚��版��
    resp = "<html>data</html>"
    soup = BeautifulSoup(resp, 'lxml')
    
    # soup 涓� BeautifulSoup 绫诲��瀵硅薄
    print(type(soup))
    

    ��绛炬��绱㈠��杩�婊�

    �烘���规�

    ��绛炬��绱㈡��find_all() ��find() 涓や锅�烘������绱㈡�规�锛�find_all() �规�浼�杩��������归���抽��海洋�����绛惧��琛�锛�find()�规�����杩���涓�涓��归��缁�����

    soup = BeautifulSoup(resp, 'lxml')
    
    # 杩���涓�涓���绛惧��涓�"a"��Tag
    soup.find("a")
    
    # 杩�������tag ��琛�
    soup.find_all("a")
    
    ## find_all�规���琚�绠���
    soup("a")
    
    # Find every tag whose name starts with "b".
    # The regex-based filters in this section need the standard-library
    # ``re`` module, which was never imported in the original snippet.
    import re
    for tag in soup.find_all(re.compile("^b")):
        print(tag.name)
    
    #�惧�哄��琛ㄤ腑��������绛�
    soup.find_all(["a", "p"])
    
    # �ユ�炬��绛惧��涓�p锛�class灞��т负"title"
    soup.find_all("p", "title")
    
    # �ユ�惧���id涓�"link2"
    soup.find_all(id="link2")
    
    # �ユ�惧���ㄥ���id��
    soup.find_all(id=True)
    
    #
    soup.find_all(href=re.compile("elsie"), id='link1')
    
    # 
    soup.find_all(attrs={"data-foo": "value"})
    
    #�ユ�炬��绛炬��海洋�����"sisters"
    soup.find(string=re.compile("sisters"))
    
    # �山����疏浚��伴����缁���
    soup.find_all("a", limit=2)
    
    # ��疏浚�涔��归���规�
    def has_class_but_no_id(tag):
        """Tag filter: match tags that have a ``class`` attribute but no ``id``."""
        has_class = tag.has_attr('class')
        if not has_class:
            # No class attribute: reject without looking at ``id``.
            return has_class
        return not tag.has_attr('id')
    soup.find_all(has_class_but_no_id)
    
    # 浠�瀵瑰��т娇�ㄨ��疏浚�涔��归���规�
    def not_lacie(href):
        """Attribute filter: truthy only for a non-empty href without "lacie"."""
        if not href:
            # Missing or empty attribute value: hand the falsy value back as-is.
            return href
        return re.search("lacie", href) is None
    soup.find_all(href=not_lacie)
    
    # 璋���tag�� find_all() �规���,Beautiful Soup浼�妫�绱㈠���tag������海洋�海洋�����,濡������虫��绱�tag���存�ュ������,��浠ヤ娇�ㄥ���� recursive=False 
    
    soup.find_all("title", recursive=False)
    

    �╁��规�

    find_parents()
    �����惰�����
    find_parent()
    绗�涓�涓��惰�����
    find_next_siblings()
    涔�����������寮�����
    find_next_sibling()
    涔�����绗�涓�涓���寮�����
    find_previous_siblings()
    涔�����������寮�����
    find_previous_sibling()
    涔�����绗�涓�涓���寮�����
    find_all_next()
    涔�����������绱�
    find_next()
    涔�����绗�涓�涓���绱�
    find_all_previous()
    涔�����������绱�
    find_previous()
    涔�����绗�涓�涓���绱�

    CSS���╁��

    Beautiful Soup����澶ч�ㄥ����CSS���╁�� http://www.w3.org/TR/CSS2/selector.html, �� Tag �� BeautifulSoup 瀵硅薄�� .select() �规�涓�浼��ュ��绗�涓插����, �冲��浣跨��CSS���╁�ㄧ��璇�娉��惧��tag��

    html_doc = """
    <html>
    <head>
      <title>The Dormouse's story</title>
    </head>
    <body>
      <p class="title"><b>The Dormouse's story</b></p>
    
      <p class="story">
        Once upon a time there were three little sisters; and their names were
        <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
        <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
        and
        <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
        and they lived at the bottom of a well.
      </p>
    
      <p class="story">...</p>
    """
    
    soup = BeautifulSoup(html_doc)
    
    # ���� a ��绛�
    soup.select("a")
    
    # ��灞��ユ��
    soup.select("body a")
    soup.select("html head title")
    
    # tag��绛句����存�ュ����绛�
    soup.select("head > title")
    soup.select("p > #link1")
    
    # �����归����绛句�������寮���绛�
    soup.select("#link1 ~ .sister")
    
    # �归����绛句�����绗�涓�涓���寮���绛�
    soup.select("#link1 + .sister")
    
    # 根据class类型
    soup.select(".sister")
    soup.select("[class~=sister]")
    
    # �规��ID�ユ��
    soup.select("#link1")
    soup.select("a#link1")
    
    # �规��澶�涓�ID�ユ��
    soup.select("#link1,#link2")
    
    # �规��灞��ф�ユ��
    soup.select('a[href]')
    
    # �规��灞��у�兼�ユ��
    soup.select('a[href^="http://example.com/"]')
    soup.select('a[href$="tillie"]')
    soup.select('a[href*=".com/el"]')
    
    # ���山��涓�涓��归��缁���
    soup.select(".sister", limit=1)
    
    # ���山��涓�涓��归��缁���
    soup.select_one(".sister")
    

    ��绛惧�硅薄�规�

    ��绛惧���

    soup = BeautifulSoup('<p class="body strikeout" id="1">Extremely bold</p><p class="body strikeout" id="2">Extremely bold2</p>')
    # �山�������� p��绛惧�硅薄
    tags = soup.find_all("p")
    # �山��绗�涓�涓�p��绛惧�硅薄
    tag = soup.p
    # 杈��烘��绛剧把�� 
    type(tag)
    # ��绛惧��
    tag.name
    # ��绛惧���
    tag.attrs
    # ��绛惧���class ����
    tag['class']
    # ��绛惧��������海洋���疏浚癸�瀵硅薄NavigableString ����疏浚�
    tag.string
    
    # 杩�����绛惧����������海洋���疏浚�
    for string in tag.strings:
        print(repr(string))
    
    # 杩�����绛惧����������海洋���疏浚�, 骞跺�绘��绌鸿�
    for string in tag.stripped_strings:
        print(repr(string))
    
    # �山����tag涓�����������������海洋�海洋�tag涓���NavigableString��疏浚癸�骞朵互Unicode海洋�绗�涓叉�煎�杈���
    tag.get_text()
    ## 浠�"|"����
    tag.get_text("|")
    ## 浠�"|"����锛�涓�杈��雾┖海洋�绗�
    tag.get_text("|", strip=True)
    

    �山��海洋�����

    tag.contents  # 杩���绗�涓�灞�海洋����圭����琛�
    tag.children  # 杩���绗�涓�灞�海洋����圭��listiterator 瀵硅薄
    for child in tag.children:
        print(child)
    
    tag.descendants # ��褰�杩�������海洋�����
    for child in tag.descendants:
        print(child)
    

    �山���惰����

    tag.parent # 杩���绗�涓�灞��惰���规��绛�
    tag.parents # ��褰�寰��板��绱��������惰�����
    
    for parent in tag.parents:
        if parent is None:
            print(parent)
        else:
            print(parent.name)
    

    �山����寮�����

    # 涓�涓�涓���寮���绱�
    tag.next_sibling 
    
    # 褰�����绛句�����������寮���绱�
    tag.next_siblings
    for sibling in tag.next_siblings:
        print(repr(sibling))
    
    # 涓�涓�涓���寮���绱�
    tag.previous_sibling
    
    # 褰�����绛句�����������寮���绱�
    tag.previous_siblings
    for sibling in tag.previous_siblings:
        print(repr(sibling))
    

    ��绱�������

    Beautiful Soup涓���姣�涓�tag疏浚�涔�涓轰�涓���element��锛�姣�涓���element��锛�琚���涓���涓�����HTML涓�����锛���浠ラ��杩������戒护��涓��剧ず��绛�

    # 褰�����绛剧��涓�涓�涓���绱�
    tag.next_element
    
    # 褰�����绛句�����������绱�
    for element in tag.next_elements:
        print(repr(element))
    
    # 褰�����绛剧����涓�涓���绱�
    tag.previous_element
    # 褰�����绛句�����������绱�
    for element in tag.previous_elements:
        print(repr(element))
    

    淇��规��绛惧���

    soup = BeautifulSoup('<b class="boldest">Extremely bold</b>')
    tag = soup.b
    
    tag.name = "blockquote"
    tag['class'] = 'verybold'
    tag['id'] = 1
    
    tag.string = "New link text."
    print(tag)
    

    淇��规��绛惧��疏浚癸�NavigableString)

    soup = BeautifulSoup('<b class="boldest">Extremely bold</b>')
    tag = soup.b
    tag.string = "New link text."

    娣诲����绛惧��疏浚癸�NavigableString)

    soup = BeautifulSoup("<a>Foo</a>")
    tag = soup.a
    tag.append("Bar")
    tag.contents
    
    # ����
    
    # NavigableString is not pulled in by ``from bs4 import BeautifulSoup``,
    # so import it before constructing one explicitly.
    from bs4 import NavigableString
    new_string = NavigableString("Bar")
    tag.append(new_string)
    print(tag)
    

    娣诲��娉ㄩ��(Comment)

    娉ㄩ����涓�涓��规����NavigableString 瀵硅薄锛���浠ュ���山��浠ラ��杩�append() �规�杩�琛�娣诲����

    from bs4 import Comment
    soup = BeautifulSoup("<a>Foo</a>")
    # Re-bind ``tag`` to the <a> element of the soup just built; without this
    # the comment would be appended to the tag left over from an earlier example.
    tag = soup.a
    # new_string() with the Comment class creates a comment node.
    new_comment = soup.new_string("Nice to see you.", Comment)
    tag.append(new_comment)
    print(tag)

    娣诲����绛�(Tag)

    娣诲����绛炬�规���涓ょ�锛�涓�绉����ㄦ��疏浚���绛剧�����ㄦ房��锛�append�规�锛�锛���涓�绉����ㄦ��疏浚�浣�缃�娣诲��(insert��insert_before��insert_after�规�)

  • append�规�
    
    soup = BeautifulSoup("<b></b>")
    tag = soup.b
    new_tag = soup.new_tag("a", href="http://www.example.com")
    new_tag.string = "Link text."
    tag.append(new_tag)
    print(tag)
  • * insert�规�锛������ㄥ�����绛惧�����瑰��琛ㄧ����疏浚�浣�缃����ュ�硅薄锛�Tag��NavigableString锛�
    
    html = '<b><a href="http://example.com/">I linked to <i>example.com</i></a></b>'
    soup = BeautifulSoup(html)
    tag = soup.a
    tag.contents
    tag.insert(1, "but did not endorse ")
    tag.contents
  • insert_before() �� insert_after() �规����ㄥ�����绛句�����涔�������寮����规坊����绱�
    
    html = '<b><a href="http://example.com/">I linked to <i>example.com</i></a></b>'
    soup = BeautifulSoup(html)
    tag = soup.new_tag("i")
    tag.string = "Don't"
    soup.b.insert_before(tag)
    soup.b
  • * wrap() �� unwrap()��浠ュ�规��疏浚���tag��绱�杩�琛���瑁���瑙e��,骞惰�����瑁�����缁�����
    
    
    # 娣诲����瑁�
    soup = BeautifulSoup("<p>I wish I was bold.</p>")
    soup.p.string.wrap(soup.new_tag("b"))
    #杈��� <b>I wish I was bold.</b>
    
    soup.p.wrap(soup.new_tag("div"))
    #杈��� <div><p><b>I wish I was bold.</b></p></div>
    
    # ��瑙e��瑁�
    markup = '<a href="http://example.com/">I linked to <i>example.com</i></a>'
    soup = BeautifulSoup(markup)
    a_tag = soup.a
    
    a_tag.i.unwrap()
    a_tag
    #杈��� <a href="http://example.com/">I linked to example.com</a>

    ���ゆ��绛�

    html = '<b><a href="http://example.com/">I linked to <i>example.com</i></a></b>'

    # Each operation below starts from a fresh soup. Run back to back on one
    # tree they break: after extract() there is no <b> left, so soup.b is None
    # and soup.b.decompose() raises AttributeError.

    # clear(): remove all children of the current tag.
    soup = BeautifulSoup(html)
    soup.b.clear()
    
    # extract(): remove the current tag and all of its children from the soup
    # and return the removed tag.
    soup = BeautifulSoup(html)
    b_tag = soup.b.extract()
    b_tag
    soup
    
    # decompose(): remove the current tag and all of its children from the
    # soup; nothing is returned.
    soup = BeautifulSoup(html)
    soup.b.decompose()
    
    # replace_with(): swap the current tag for the given element.
    soup = BeautifulSoup(html)
    tag = soup.i
    new_tag = soup.new_tag("p")
    new_tag.string = "Don't"
    tag.replace_with(new_tag)
    

    �朵��规�

    杈���

    # �煎���杈���
    tag.prettify()
    tag.prettify("latin-1")
    
  • 浣跨��Beautiful Soup瑙f����,��妗i�借�潘��㈡��河渠�Unicode锛��规��海洋�绗�涔�琚�潘���涓�Unicode锛�濡���广大���妗h浆�㈡��海洋�绗�涓�,Unicode缂���浼�琚�缂�����UTF-8.杩��山氨��娉�姝g‘�剧ずHTML�规��海洋�绗�河渠�
  • 浣跨��Unicode��,Beautiful Soup杩�浼��鸿�界������寮��封��潘��㈡��HTML��XML涓����规��海洋�绗�
  • ��妗g���

    浣跨��Beautiful Soup瑙f����,��妗i�借�潘��㈡��河渠�Unicode锛��朵娇�ㄤ���缂������ㄦ�娴���海洋�搴��ヨ����褰�����妗g���骞惰浆�㈡��Unicode缂�����

    soup = BeautifulSoup(html)
    soup.original_encoding
    
    # 涔���浠ユ���ㄦ��疏浚���妗g��缂��� 
    soup = BeautifulSoup(html, from_encoding="iso-8859-8")
    soup.original_encoding
    
    # 涓烘��楂���缂������ㄦ�娴�����妫�娴�����锛�涔���浠ラ������や�河渠�缂���
    soup = BeautifulSoup(markup, exclude_encodings=["ISO-8859-7"])
    
  • ��杩�Beautiful Soup杈��烘��妗f��,涓�绠¤��ユ��妗f��浠�涔�缂����瑰�,榛�璁よ��雾�����涓�UTF-8缂���
  • ��妗hВ����

    Beautiful Soup��������, ��lxml��, ��html5lib��, �� ��html.parser��

    soup=BeautifulSoup("<a><b /></a>")
    soup
    #杈��虎� <html><body><a><b></b></a></body></html>
    soup=BeautifulSoup("<a></p>", "lxml")
    soup
    #杈��虎� <html><body><a></a></body></html>
    soup=BeautifulSoup("<a></p>", "html5lib")
    soup
    #杈��虎� <html><head></head><body><a><p></p></a></body></html>
    soup=BeautifulSoup("<a></p>", "html.parser")
    soup
    #杈��虎� <a></a>
    

    ������妗�

    https://www.crummy.com/software/BeautifulSoup/bs4/doc.zh