From 4bab6f1fa6ecdad059f12659aefb1475ff4f226b Mon Sep 17 00:00:00 2001 From: Brett Laptop Date: Wed, 15 Jan 2025 17:08:57 -0500 Subject: [PATCH] this doesn't work lol --- README.md | 2 + parser.py | 63 ++ skyscrapers.php | 2196 +++++++++++++++++++++++++++++++++++++++++++++++ test.txt | 565 ++++++++++++ 4 files changed, 2826 insertions(+) create mode 100644 README.md create mode 100644 parser.py create mode 100644 skyscrapers.php create mode 100644 test.txt diff --git a/README.md b/README.md new file mode 100644 index 0000000..476ffed --- /dev/null +++ b/README.md @@ -0,0 +1,2 @@ +# Skyscraper Parser +Simple python parser for getting skyscraper problems from the Caribou context website into a format which can be used in machine learning tasks \ No newline at end of file diff --git a/parser.py b/parser.py new file mode 100644 index 0000000..f92f5da --- /dev/null +++ b/parser.py @@ -0,0 +1,63 @@ +from bs4 import BeautifulSoup +import subprocess +from selenium import webdriver +from requests_html import HTMLSession +# from seleniumrequests import Firefox +import time + +URL = "https://cariboutests.com/games/skyscrapers.php" + + +def process_header_tr(tr): + for td in tr.find_all('td')[1:-1]: + print (f"header number: {td.div.span.text}") + +def process_middle_tr(tr): + tds = tr.find_all('td') + left = tds[0].div.span.text + right = tds[-1].div.span.text + print (f"left: {left}, right: {right}") + +def process_page(source): + soup = BeautifulSoup(source, 'html.parser') + problem = soup.find(id='sky') + table = problem.table.tbody + trs = table.find_all('tr') + for index,tr in enumerate(trs): + if index == 0: + process_header_tr(tr) + elif index == len(trs)-1: + process_header_tr(tr) + else: + process_middle_tr(tr) + +def download_page(): + subprocess.run(["curl", "-X", "POST", "--user-agent", "Mozilla/5.0 (X11; Linux x86_64; rv:131.0) Gecko/20100101 Firefox/131.0", "-d", "board_size: 8", "-o", "skyscrapers.php", URL]) + with open("skyscrapers.php", "r+") as file: + content = file.read() + # subprocess.run(["rm", "skyscrapers.php"]) + print(content) + return content + # options = webdriver.ChromeOptions() + # options.add_argument('--headless') + # driver = webdriver.Chrome(options=options) + + # session = HTMLSession() + # r = session.post(url=URL, data={"board_size": "6"}, headers={"User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:131.0) Gecko/20100101 Firefox/131.0"}) + # r.html.render() + # print(r.html.find(id='sky')) + # print(r.text) + + # print(driver.page_source) + + # post(driver, "6") + # driver.get(URL) + + # src = driver.page_source + # print(src) + # driver.quit() + + # return r.text + +if __name__ == "__main__": + process_page(download_page()) \ No newline at end of file diff --git a/skyscrapers.php b/skyscrapers.php new file mode 100644 index 0000000..9cada64 --- /dev/null +++ b/skyscrapers.php @@ -0,0 +1,2196 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Skyscrapers © + + + + + + + + + + + + + + + + + + + + + + +
+
+
+
+ Flag +
+
+
+
+ +
+
+ +
+ +
+ + + +
+
+ + + + + + + + +
+
+ + + +
+
+
+

300000

+
Board Size
English | Français | فارسی | 中文 | Українська | Azerbaijani | ខ្មែរ | Tiếng Việt | Bahasa Melayu | Deutsch | O'zbek | Русский +
+

Skyscrapers©

+
+ Total number of plays: 611085 +
+
+ + + + +
611085

+ +
+

+


+ +

+ + + +
+ + + + + + +
+
+ +
+ + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Skyscrapers © + + + + + + + + + + + + + + + + + + + + + + +
+
+
+
+ Flag +
+
+
+
+ +
+
+ +
+ +
+ + + +
+
+ + + + + + + + +
+
+ + + +
+
+
+

300000

+
Board Size
English | Français | فارسی | 中文 | Українська | Azerbaijani | ខ្មែរ | Tiếng Việt | Bahasa Melayu | Deutsch | O'zbek | Русский +
+

Skyscrapers©

+
+ Total number of plays: 611085 +
+
+ + + + +
611085

+ +
+

+


+ +

+ + + +
+ + + + + + +
+
+ +
+ + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Skyscrapers © + + + + + + + + + + + + + + + + + + + + + + +
+
+
+
+ Flag +
+
+
+
+ +
+
+ +
+ +
+ + + +
+
+ + + + + + + + +
+
+ + + +
+
+
+

300000

+
Board Size
English | Français | فارسی | 中文 | Українська | Azerbaijani | ខ្មែរ | Tiếng Việt | Bahasa Melayu | Deutsch | O'zbek | Русский +
+

Skyscrapers©

+
+ Total number of plays: 611085 +
+
+ + + + +
611085

+ +
+

+


+ +

+ + + +
+ + + + + + +
+
+ +
+ + + +
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Skyscrapers © + + + + + + + + + + + + + + + + + + + + + + +
+
+
+
+ Flag +
+
+
+
+ +
+
+ +
+ +
+ + + +
+
+ + + + + + + + +
+
+ + + +
+
+
+

300000

+
Board Size
English | Français | فارسی | 中文 | Українська | Azerbaijani | ខ្មែរ | Tiếng Việt | Bahasa Melayu | Deutsch | O'zbek | Русский +
+

Skyscrapers©

+
+ Total number of plays: 611085 +
+
+ + + + +
611085

+ +
+

+


+ +

+ + + +
+ + + + + + +
+
+ +
+ + + +
+ + + + + + + + + + + + + + + + + diff --git a/test.txt b/test.txt new file mode 100644 index 0000000..9e96acf --- /dev/null +++ b/test.txt @@ -0,0 +1,565 @@ +/usr/bin/python3.11 /home/brett/Documents/code/python/skyscraper_parser/parser.py +[INFO] Starting Chromium download. +100%|██████████| 183M/183M [00:23<00:00, 7.65Mb/s] +[INFO] Beginning extraction +[INFO] Chromium extracted to: /home/brett/.local/share/pyppeteer/local-chromium/1181205 +Traceback (most recent call last): + File "/home/brett/Documents/code/python/skyscraper_parser/parser.py", line 55, in + process_page(download_page()) + ^^^^^^^^^^^^^^^ + File "/home/brett/Documents/code/python/skyscraper_parser/parser.py", line 48, in download_page + src = driver.page_source + ^^^^^^ +NameError: name 'driver' is not defined. Did you mean: 'webdriver'? + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + Skyscrapers © + + + + + + + + + + + + + + + + + + + + + + +
+
+
+
+ Flag +
+
+
+
+ +
+
+ +
+ +
+ + + +
+
+ + + + + + + + +
+
+ + + +
+
+
+

300000

+
Board Size
English | Français | فارسی | 中文 | Українська | Azerbaijani | ខ្មែរ | Tiếng Việt | Bahasa Melayu | Deutsch | O'zbek | Русский +
+

Skyscrapers©

+
+ Total number of plays: 611085 +
+
+ + + + +
611085

+ +
+

+


+ +

+ + + +
+ + + + + + +
+
+ +
+ + + +
+ + + + + + + + + + + + + + + + + + + +Process finished with exit code 1