aboutsummaryrefslogtreecommitdiff
path: root/scripts/scraping/scraper.py
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/scraping/scraper.py')
-rw-r--r--scripts/scraping/scraper.py35
1 files changed, 35 insertions, 0 deletions
diff --git a/scripts/scraping/scraper.py b/scripts/scraping/scraper.py
new file mode 100644
index 0000000..50b46ae
--- /dev/null
+++ b/scripts/scraping/scraper.py
@@ -0,0 +1,35 @@
+import requests
+from urllib import parse
+import json
+from bs4 import BeautifulSoup
+
# Browser-like User-Agent sent with the page request (same value the
# original code used).
_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)


def _as_list(value):
    """Normalize a JSON-LD value (missing, scalar, or list) into a list."""
    if value is None:
        return []
    if isinstance(value, (list, tuple)):
        return list(value)
    return [value]


def _is_recipe(node):
    """Return True when a graph node's ``@type`` is, or includes, "Recipe"."""
    return isinstance(node, dict) and "Recipe" in _as_list(node.get("@type"))


def _instruction_texts(steps):
    """Flatten ``recipeInstructions`` into a flat list of step strings."""
    texts = []
    for step in _as_list(steps):
        if isinstance(step, str):
            texts.append(step)
        elif isinstance(step, dict):
            if "text" in step:
                texts.append(step["text"])
            else:
                # Nodes without their own text (e.g. a HowToSection) keep
                # their steps nested under "itemListElement".
                texts.extend(_instruction_texts(step.get("itemListElement")))
    return texts


def _first_image_url(image):
    """Return the first image URL from a string, list, or ImageObject value."""
    images = _as_list(image)
    if not images:
        return None
    first = images[0]
    if isinstance(first, dict):
        return first.get("url")
    return first


def _clean_tags(node):
    """Merge cuisine and keywords into lowercase, de-duplicated, ordered tags."""
    keywords = node.get("keywords") or []
    if isinstance(keywords, str):
        keywords = keywords.split(",")
    raw_tags = _as_list(node.get("recipeCuisine")) + _as_list(keywords)
    cleaned = (tag.strip().lower() for tag in raw_tags if isinstance(tag, str))
    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # result is deterministic; empty strings (e.g. from "".split(",")) are
    # dropped.
    return list(dict.fromkeys(tag for tag in cleaned if tag))


def _build_recipe(node, user_name):
    """Convert one Recipe graph node into the recipe dict returned by scrape."""
    title = node["name"]
    return {
        "user": user_name,
        # Percent-encoded, lowercased title.
        "slug": parse.quote(title).lower(),
        "title": title,
        "image": _first_image_url(node.get("image")),
        "url": node.get("mainEntityOfPage"),
        "tags": _clean_tags(node),
        "ingredients": _as_list(node.get("recipeIngredient")),
        "instructions": _instruction_texts(node.get("recipeInstructions")),
        "visible_by": ["jez"],
        # TODO: add an "encoded_url" field; left unfinished in the original.
    }


def scrape(url: str, user_name: str, timeout=10):
    """Fetch a page and extract its first Recipe from the Yoast schema graph.

    The page is expected to contain a ``<script class="yoast-schema-graph">``
    element holding JSON with an ``@graph`` list. The first node in that list
    whose ``@type`` is (or includes) ``"Recipe"`` is converted to a dict.

    Args:
        url: Address of the recipe page to download.
        user_name: Value stored under the ``"user"`` key of the result.
        timeout: Seconds to wait for the HTTP request before giving up.

    Returns:
        A dict with the keys ``user``, ``slug``, ``title``, ``image``, ``url``,
        ``tags``, ``ingredients``, ``instructions`` and ``visible_by``, or
        ``None`` when the page has no schema script or no Recipe node.

    Raises:
        requests.RequestException: On network failure, timeout, or a
            non-success HTTP status.
        json.JSONDecodeError: If the schema script does not contain valid JSON.
        KeyError: If the Recipe node has no ``name``.
    """
    response = requests.get(
        url, headers={"User-Agent": _USER_AGENT}, timeout=timeout
    )
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    schema_tag = soup.find('script', class_='yoast-schema-graph')
    if schema_tag is None or not schema_tag.string:
        return None

    payload = json.loads(schema_tag.string)
    graph = payload.get("@graph", []) if isinstance(payload, dict) else []
    for node in graph:
        if _is_recipe(node):
            return _build_recipe(node, user_name)
    return None