This is my first project. I set out to:
- Fetch an HTTPS URL with Python
- Scrape data from the page
- Export the data to a CSV file
To request the URL I started with the built-in urllib module, which I quickly
found to be quite limiting. As a result I settled on installing two additional
packages: requests and BeautifulSoup4.
I used the standard-library csv module to export the parsed data to a CSV file.
import requests
from bs4 import BeautifulSoup
import csv
# URL of the Wikipedia page whose tables we want to scrape.
myurl = 'https://en.wikipedia.org/wiki/Programming_languages_used_in_most_popular_websites'
# Present a regular browser User-Agent; some sites reject the default
# `python-requests` UA string.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36'}

# Send a GET request to the URL.
# BUG FIX: the second positional argument of requests.get() is `params`,
# not `headers`, so the original call sent the UA dict as query-string
# parameters instead of as HTTP headers. Use the `headers=` keyword.
# The timeout keeps the script from hanging forever on a stalled connection.
r = requests.get(myurl, headers=headers, timeout=30)

# Check if the request was successful
if r.status_code == 200:
    # Parse the HTML content of the webpage
    openedPage = BeautifulSoup(r.content, 'html.parser')
    # Open a CSV file to write the table data; newline='' is required by
    # the csv module so it controls line endings itself.
    with open('table_data.csv', mode='w', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        # Walk every table on the page and dump each of its rows.
        for table in openedPage.find_all('table'):
            for row in table.find_all('tr'):
                # A row's cells may be data (<td>) or header (<th>) cells.
                cols = row.find_all(['td', 'th'])
                # Write the whitespace-stripped text of each cell as one CSV row.
                writer.writerow([col.get_text(strip=True) for col in cols])
else:
    print('Failed to fetch the page')
Completed