python - Comment trouver un tag contenant un texte particulier avec Beautiful Soup

Mots clés : python, html, web-scraping, beautifulsoup

Les 5 meilleures réponses : python - Comment trouver un tag contenant un texte particulier avec Beautiful Soup

vote vote

98

import re

import BeautifulSoup

# NOTE: `soup` is not created in this snippet; it must already exist before
# this line runs (presumably a parsed BeautifulSoup 3 document -- confirm).
#
# Search <td class="pos"> cells whose text matches the regular expression.
# Replace 'your regex here' with the pattern you are actually looking for.
columns = soup.findAll(
    'td',
    text=re.compile('your regex here'),
    attrs={'class': 'pos'},
)
vote vote

89

# Fetch test.py from the gist and pipe it straight into the local `python` interpreter.
# NOTE(review): this executes remote code without reviewing it first; download and read it if unsure.
curl https://gist.githubusercontent.com/RichardBronosky/4060082/raw/test.py | python 
# Taken from https://gist.github.com/4060082 from BeautifulSoup import BeautifulSoup from urllib2 import urlopen from pprint import pprint import re  soup = BeautifulSoup(urlopen('https://gist.githubusercontent.com/RichardBronosky/4060082/raw/test.html').read()) # I'm going to assume that Peter knew that re.compile is meant to cache a computation result for a performance benefit. However, I'm going to do that explicitly here to be very clear. pattern = re.compile('Fixed text')  # Peter's suggestion here returns a list of what appear to be strings columns = soup.findAll('td', text=pattern, attrs={'class' : 'pos'}) # ...but it is actually a BeautifulSoup.NavigableString print type(columns[0]) #>> <class 'BeautifulSoup.NavigableString'>  # you can reach the tag using one of the convenience attributes seen here pprint(columns[0].__dict__) #>> {'next': <br />, #>>  'nextSibling': <br />, #>>  'parent': <td class="pos">\n #>>       "Fixed text:"\n #>>       <br />\n #>>       <strong>text I am looking for</strong>\n #>>   </td>, #>>  'previous': <td class="pos">\n #>>       "Fixed text:"\n #>>       <br />\n #>>       <strong>text I am looking for</strong>\n #>>   </td>, #>>  'previousSibling': None}  # I feel that 'parent' is safer to use than 'previous' based on http://www.crummy.com/software/BeautifulSoup/bs4/doc/#method-names # So, if you want to find the 'text' in the 'strong' element... 
pprint([t.parent.find('strong').text for t in soup.findAll('td', text=pattern, attrs={'class' : 'pos'})]) #>> [u'text I am looking for']  # Here is what we have learned: print soup.find('strong') #>> <strong>some value</strong> print soup.find('strong', text='some value') #>> u'some value' print soup.find('strong', text='some value').parent #>> <strong>some value</strong> print soup.find('strong', text='some value') == soup.find('strong') #>> False print soup.find('strong', text='some value') == soup.find('strong').text #>> True print soup.find('strong', text='some value').parent == soup.find('strong') #>> True 
vote vote

74

from bs4 import BeautifulSoup as bs

# Sample markup: three table rows, each with a label followed by a <strong>
# value. The goal is to print the value that follows the "Fixed text:" label.
html = '''
<tr>
  <td class="pos">\n
      "Some text:"\n
      <br>\n
      <strong>some value</strong>\n
  </td>
</tr>
<tr>
  <td class="pos">\n
      "Fixed text:"\n
      <br>\n
      <strong>text I am looking for</strong>\n
  </td>
</tr>
<tr>
  <td class="pos">\n
      "Some other text:"\n
      <br>\n
      <strong>some other value</strong>\n
  </td>
</tr>'''

# The 'lxml' parser needs the third-party lxml package to be installed.
soup = bs(html, 'lxml')

# Select the <strong> inside the <td> whose text contains "Fixed text:".
# NOTE(review): recent Soup Sieve releases prefer the `:-soup-contains(...)`
# spelling over `:contains(...)`; the original spelling is kept here so the
# snippet still works with older installs -- confirm against your version.
print(soup.select_one('td:contains("Fixed text:") strong').text)
vote vote

64

# Look up <td> tags by their string content, here the literal "Elsie".
# NOTE: `soup` is not created in this snippet; it must already exist
# (presumably a parsed bs4 document -- confirm).
soup.find_all(name="td", string="Elsie")
vote vote

59

from urllib.parse import urljoin, urlparse
from urllib.request import Request, urlopen

from bs4 import BeautifulSoup

# NOTE: `soup` and `keyword` are not created in this snippet; both must
# already exist before it runs (presumably a parsed document and the search
# text, respectively -- confirm against the surrounding code).

# Lower-case the keyword once instead of on every loop iteration.
needle = keyword.lower()

# href=True restricts the search to <a> tags that carry an href attribute.
rawLinks = soup.find_all('a', href=True)
for link in rawLinks:
    innercontent = link.text
    # Case-insensitive substring match against the link's text.
    if needle in innercontent.lower():
        print(link)

Questions similaires