skyscraper_parser/parser.py

62 lines
2.0 KiB
Python
Raw Permalink Normal View History

2025-01-15 17:08:57 -05:00
from bs4 import BeautifulSoup
2025-01-15 17:53:51 -05:00
from playwright.sync_api import sync_playwright
import argparse
2025-01-15 17:08:57 -05:00
# Address of the Skyscrapers puzzle page that download_page() opens in the browser.
URL = "https://cariboutests.com/games/skyscrapers.php"
def process_header_tr(tr):
    """Return the integer clues found in a border (first or last) table row.

    The first and last ``td`` of the row are skipped (presumably the table's
    corner cells -- confirm against the page markup); the text of every
    remaining cell is converted with ``int``.

    Args:
        tr: A row element exposing ``find_all("td")`` whose cells have ``.text``.

    Returns:
        A list of ints, one per inner cell, in document order.

    Raises:
        ValueError: If an inner cell's text is not a valid integer.
    """
    cells = tr.find_all("td")
    clues = []
    for cell in cells[1:-1]:
        clues.append(int(cell.text))
    return clues
2025-01-15 17:08:57 -05:00
def process_middle_tr(tr):
    """Return the left and right clues of a non-border table row.

    Only the first and last ``td`` of the row are inspected; each clue is read
    from the cell's nested ``div > span`` text and converted with ``int``.

    Args:
        tr: A row element exposing ``find_all('td')``.

    Returns:
        A two-item list ``[left, right]`` of ints.

    Raises:
        IndexError: If the row has no cells.
        ValueError: If either clue text is not a valid integer.
    """
    cells = tr.find_all('td')
    # Pull both texts out before converting, so lookups happen ahead of int().
    edge_texts = [cell.div.span.text for cell in (cells[0], cells[-1])]
    return list(map(int, edge_texts))
2025-01-15 17:08:57 -05:00
def process_page(source):
    """Extract the clue rows from the puzzle page's HTML.

    The puzzle table is located as the ``tbody`` of the ``table`` inside the
    element with id ``sky``. The first and last rows are parsed with
    ``process_header_tr``; every row in between is parsed with
    ``process_middle_tr``.

    Args:
        source: HTML markup of the page as a string.

    Returns:
        A list with one list of ints per table row, in document order.
    """
    soup = BeautifulSoup(source, 'html.parser')
    rows = soup.find(id='sky').table.tbody.find_all('tr')
    last_index = len(rows) - 1

    results = []
    for position, row in enumerate(rows):
        # Border rows (top and bottom) share the same layout and parser.
        is_border_row = position in (0, last_index)
        handler = process_header_tr if is_border_row else process_middle_tr
        results.append(handler(row))
    return results
def download_page(board_size):
    """Load the puzzle page in headless Chromium and return its HTML.

    Opens ``URL``, chooses ``board_size`` in the page's ``select`` element,
    clicks the submit input, waits for the page's load state and then captures
    the page content.

    Args:
        board_size: Board size to select; it is passed to the page as ``str(board_size)``.

    Returns:
        The page content as a string.
    """
    with sync_playwright() as playwright:
        chromium = playwright.chromium.launch(headless=True)
        tab = chromium.new_context().new_page()
        tab.goto(URL, wait_until="load")
        tab.select_option("select", value=str(board_size))
        tab.click("input[type=submit]")
        tab.wait_for_load_state("load")
        html = tab.content()
        chromium.close()
    return html
def run():
    """Command-line entry point: download a puzzle and write it to a file.

    Options:
        --board_size / -b: Size of board to download (int, default 6).
        --out_file / -o: File to write the problem to
            (default ``skyscraper_problem.txt``).

    Output format: a ``BOARD_SIZE:<TAB><size>`` line, a blank line, then one
    line per parsed table row with every value followed by a tab.
    """
    parser = argparse.ArgumentParser(description='Download sky scraper problems')
    parser.add_argument('--board_size', '-b', type=int, help='Size of board to download', default=6)
    parser.add_argument("--out_file", "-o", help="File to write the problem to", default="skyscraper_problem.txt")
    args = parser.parse_args()

    # Download and parse BEFORE opening the output file: opening in "w" mode
    # truncates it, so a failed download or parse would otherwise destroy an
    # existing file and leave only the header behind.
    results = process_page(download_page(args.board_size))

    lines = ["BOARD_SIZE:\t" + str(args.board_size) + '\n', '\n']
    for result in results:
        # Each value keeps its trailing tab so the file format is unchanged.
        lines.append("".join(str(data) + "\t" for data in result) + '\n')

    with open(args.out_file, "w") as file:
        file.writelines(lines)
2025-01-15 17:08:57 -05:00
# Invoke the command-line entry point only when this file is executed as a
# script; importing the module has no side effects.
if __name__ == "__main__":
    run()