Links: PROGRAMMING - PYTHON
Rel: python 3rd party packages
Ref: docs, lxml
Tags: #public
Beautiful Soup
parsing HTML (for web scraping)
pip install beautifulsoup4
pip install lxml  # needs a parser backend
import bs4
from bs4 import BeautifulSoup
title = soup.title.text
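A minimal self-contained sketch (no network needed; the HTML string here is made up for illustration):
from bs4 import BeautifulSoup

html = '<html><head><title>Demo</title></head><body><p>Hi</p></body></html>'
soup = BeautifulSoup(html, 'lxml')
print(soup.title)       # <title>Demo</title>
print(soup.title.text)  # Demo
print(soup.p.text)      # Hi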
simplescrape.py
import bs4 as bs
import urllib.request
sauce = urllib.request.urlopen('https://www.crummy.com/software/BeautifulSoup/').read()
soup = bs.BeautifulSoup(sauce, 'lxml')
# print(soup.title)
# print(soup.title.text)
# print(soup.find_all('p'))  # find all paragraph tags
# for paragraph in soup.find_all('p'):
#     # print(paragraph.string)  # works with a NavigableString (vs when there are child tags)
#     print(paragraph.text)
# print(soup.get_text())
for url in soup.find_all('a'):
    print(url.get('href'))
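Note that get('href') returns links exactly as written in the page, so relative links stay relative. A sketch that absolutizes them with urljoin from the standard library:
from urllib.parse import urljoin

base = 'https://www.crummy.com/software/BeautifulSoup/'
for url in soup.find_all('a'):
    href = url.get('href')
    if href:  # some <a> tags have no href attribute
        print(urljoin(base, href))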
navigatingtags.py
import bs4 as bs
import urllib.request
sauce = urllib.request.urlopen('https://pythonprogramming.net/').read()
soup = bs.BeautifulSoup(sauce, 'lxml')
nav = soup.nav
# print(nav)
# for url in nav.find_all('a'):
#     print(url.get('href'))
# body = soup.body
# for paragraph in body.find_all('p'):
#     print(paragraph.text)
for div in soup.find_all('div', class_='body'):
    print(div.text)
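The class_ argument has a trailing underscore because class is a Python keyword. The same filter can be written as a CSS selector with soup.select:
# equivalent CSS-selector form
for div in soup.select('div.body'):
    print(div.text)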
navigatingtables.py
import bs4 as bs
import urllib.request
sauce = urllib.request.urlopen('https://pythonprogramming.net/parsememcparseface/').read()
soup = bs.BeautifulSoup(sauce, 'lxml')
# table = soup.table  # identical to the line below
table = soup.find('table')
# print(table)
table_rows = table.find_all('tr')
for tr in table_rows:
    td = tr.find_all('td')
    row = [i.text for i in td]
    print(row)  # rows with only <th> cells (headers) come out as []
# BETTER TO USE PANDAS
"""
import pandas as pd
dfs = pd.read_html('URL', header=0)
for df in dfs:
    print(df)
"""
navigatingxml.py
import bs4 as bs
import urllib.request
sauce = urllib.request.urlopen('https://pythonprogramming.net/sitemap.xml').read()
soup = bs.BeautifulSoup(sauce, 'xml')  # 'xml' parser for XML, vs 'lxml' for HTML
# print(soup)
for url in soup.find_all('loc'):
    print(url.text)
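A small variation that collects the sitemap URLs into a list (e.g. to feed a crawler) instead of printing them:
urls = [loc.text for loc in soup.find_all('loc')]
print(len(urls), 'pages listed in the sitemap')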
Dynamic JavaScript scraping (Sentdex, YouTube):
https://pythonprogramming.net/javascript-dynamic-scraping-parsing-beautiful-soup-tutorial/
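One common approach (not necessarily the one in the linked tutorial) is to let a real browser render the JavaScript first, e.g. with selenium, then hand the rendered HTML to Beautiful Soup. A sketch assuming selenium and Chrome are installed; the 'jstest' class is assumed from the demo page and may need adjusting:
import bs4 as bs
from selenium import webdriver

driver = webdriver.Chrome()  # assumes Chrome + a matching driver are available
driver.get('https://pythonprogramming.net/parsememcparseface/')
soup = bs.BeautifulSoup(driver.page_source, 'lxml')  # HTML after JS has run
driver.quit()
print(soup.find('p', class_='jstest'))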
References:
- https://miningthedetails.com/blog/python/BeautifulSoupWebScraping/