diff --git a/README.md b/README.md index 476ffed..8c3e8f1 100644 --- a/README.md +++ b/README.md @@ -1,2 +1,6 @@ # Skyscraper Parser -Simple python parser for getting skyscraper problems from the Caribou context website into a format which can be used in machine learning tasks \ No newline at end of file +Simple python parser for getting skyscraper problems from the Caribou context website into a format which can be used in machine learning tasks + +requires `bs4` and `playwright` please install these using a relevant package manager or pip + +run with `python3 parser.py [--board_size (2-26)] [--out_file (path/to/file)]` \ No newline at end of file diff --git a/parser.py b/parser.py index f92f5da..349d0cb 100644 --- a/parser.py +++ b/parser.py @@ -1,63 +1,62 @@ from bs4 import BeautifulSoup -import subprocess -from selenium import webdriver -from requests_html import HTMLSession -# from seleniumrequests import Firefox -import time +from playwright.sync_api import sync_playwright +import argparse URL = "https://cariboutests.com/games/skyscrapers.php" - def process_header_tr(tr): - for td in tr.find_all('td')[1:-1]: - print (f"header number: {td.div.span.text}") + return [int(td.text) for td in tr.find_all("td")[1:-1]] def process_middle_tr(tr): tds = tr.find_all('td') left = tds[0].div.span.text right = tds[-1].div.span.text - print (f"left: {left}, right: {right}") + return [int(left), int(right)] def process_page(source): soup = BeautifulSoup(source, 'html.parser') problem = soup.find(id='sky') table = problem.table.tbody trs = table.find_all('tr') + results = [] for index,tr in enumerate(trs): if index == 0: - process_header_tr(tr) + results.append(process_header_tr(tr)) elif index == len(trs)-1: - process_header_tr(tr) + results.append(process_header_tr(tr)) else: - process_middle_tr(tr) + results.append(process_middle_tr(tr)) + return results -def download_page(): - subprocess.run(["curl", "-X", "POST", "--user-agent", "Mozilla/5.0 (X11; Linux x86_64; rv:131.0) Gecko/20100101 Firefox/131.0", "-d", "board_size: 8", "-o", "skyscrapers.php", URL]) - with open("skyscrapers.php", "r+") as file: - content = file.read() - # subprocess.run(["rm", "skyscrapers.php"]) - print(content) - return content - # options = webdriver.ChromeOptions() - # options.add_argument('--headless') - # driver = webdriver.Chrome(options=options) +def download_page(board_size): + with sync_playwright() as p: + browser = p.chromium.launch(headless=True) + context = browser.new_context() + page = context.new_page() + page.goto(URL, wait_until="load") + page.select_option("select", value=str(board_size)) + page.click("input[type=submit]") + page.wait_for_load_state("load") + src = page.content() + browser.close() + return src - # session = HTMLSession() - # r = session.post(url=URL, data={"board_size": "6"}, headers={"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:131.0) Gecko/20100101 Firefox/131.0"}) - # r.html.render() - # print(r.html.find(id='sky')) - # print(r.text) +def run(): + parser = argparse.ArgumentParser(description='Download sky scraper problems') - # print(driver.page_source) + parser.add_argument('--board_size', '-b', type=int, help='Size of board to download', default=6) + parser.add_argument("--out_file", "-o", help="File to write the problem to", default="skyscraper_problem.txt") - # post(driver, "6") - # driver.get(URL) + args = parser.parse_args() - # src = driver.page_source - # print(src) - # driver.quit() - - # return r.text + with open(args.out_file, "w") as file: + file.write("BOARD_SIZE:\t" + str(args.board_size) + '\n') + file.write('\n') + results = process_page(download_page(args.board_size)) + for result in results: + for data in result: + file.write(str(data) + "\t") + file.write('\n') if __name__ == "__main__": - process_page(download_page()) \ No newline at end of file + run() \ No newline at end of file diff --git a/test.txt b/test.txt deleted file mode 100644 index 9e96acf..0000000 --- a/test.txt +++ /dev/null @@ -1,565 +0,0 @@ -/usr/bin/python3.11 /home/brett/Documents/code/python/skyscraper_parser/parser.py -[INFO] Starting Chromium download. -100%|██████████| 183M/183M [00:23<00:00, 7.65Mb/s] -[INFO] Beginning extraction -[INFO] Chromium extracted to: /home/brett/.local/share/pyppeteer/local-chromium/1181205 -Traceback (most recent call last): - File "/home/brett/Documents/code/python/skyscraper_parser/parser.py", line 55, in - process_page(download_page()) - ^^^^^^^^^^^^^^^ - File "/home/brett/Documents/code/python/skyscraper_parser/parser.py", line 48, in download_page - src = driver.page_source - ^^^^^^ -NameError: name 'driver' is not defined. Did you mean: 'webdriver'? - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Skyscrapers © - - - - - - - - - - - - - - - - - - - - - - -
-
-
-
- Flag -
-
-
-
- -
-
- -
- -
- - - -
-
- - - - - - - - -
-
- - - -
-
-
-

300000

-
Board Size
English | Français | فارسی | 中文 | Українська | Azerbaijani | ខ្មែរ | Tiếng Việt | Bahasa Melayu | Deutsch | O'zbek | Русский -
-

Skyscrapers©

-
- Total number of plays: 611085 -
-
- - - - -
611085

- -
-

-


- -

- - - -
- - - - - - -
-
- -
- - - -
- - - - - - - - - - - - - - - - - - - -Process finished with exit code 1