skyscraper_parser/parser.py

63 lines
1.8 KiB
Python
Raw Normal View History

2025-01-15 17:08:57 -05:00
from bs4 import BeautifulSoup
import subprocess
from selenium import webdriver
from requests_html import HTMLSession
# from seleniumrequests import Firefox
import time
URL = "https://cariboutests.com/games/skyscrapers.php"
def process_header_tr(tr):
for td in tr.find_all('td')[1:-1]:
print (f"header number: {td.div.span.text}")
def process_middle_tr(tr):
tds = tr.find_all('td')
left = tds[0].div.span.text
right = tds[-1].div.span.text
print (f"left: {left}, right: {right}")
def process_page(source):
soup = BeautifulSoup(source, 'html.parser')
problem = soup.find(id='sky')
table = problem.table.tbody
trs = table.find_all('tr')
for index,tr in enumerate(trs):
if index == 0:
process_header_tr(tr)
elif index == len(trs)-1:
process_header_tr(tr)
else:
process_middle_tr(tr)
def download_page():
subprocess.run(["curl", "-X", "POST", "--user-agent", "Mozilla/5.0 (X11; Linux x86_64; rv:131.0) Gecko/20100101 Firefox/131.0", "-d", "board_size: 8", "-o", "skyscrapers.php", URL])
with open("skyscrapers.php", "r+") as file:
content = file.read()
# subprocess.run(["rm", "skyscrapers.php"])
print(content)
return content
# options = webdriver.ChromeOptions()
# options.add_argument('--headless')
# driver = webdriver.Chrome(options=options)
# session = HTMLSession()
# r = session.post(url=URL, data={"board_size": "6"}, headers={"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:131.0) Gecko/20100101 Firefox/131.0"})
# r.html.render()
# print(r.html.find(id='sky'))
# print(r.text)
# print(driver.page_source)
# post(driver, "6")
# driver.get(URL)
# src = driver.page_source
# print(src)
# driver.quit()
# return r.text
if __name__ == "__main__":
process_page(download_page())