aboutsummaryrefslogtreecommitdiff
path: root/scripts/scraping/scraper.py
diff options
context:
space:
mode:
Diffstat (limited to 'scripts/scraping/scraper.py')
-rw-r--r--scripts/scraping/scraper.py84
1 files changed, 55 insertions, 29 deletions
diff --git a/scripts/scraping/scraper.py b/scripts/scraping/scraper.py
index 50b46ae..8919d46 100644
--- a/scripts/scraping/scraper.py
+++ b/scripts/scraping/scraper.py
@@ -3,33 +3,59 @@ from urllib import parse
import json
from bs4 import BeautifulSoup
+def extractInstructions(instructions):
+ returnedInstructions = []
+ for inst in instructions:
+
+ # If there are sub sections, parse them
+ if "itemListElement" in inst:
+ print("hitting sub")
+ subObj = {}
+ subInst = []
+ subObj["name"] = inst["name"]
+ for el in inst["itemListElement"]:
+ subInst.append(el["text"])
+ subObj["subInstructions"] = subInst
+ returnedInstructions.append(subObj)
+
+ # Else we have a shallower array of instructions
+ else:
+ returnedInstructions.append(inst["text"])
+
+ return returnedInstructions
+
+
def scrape(url, user_name):
- data = requests.get(url, headers= {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"})
- html = BeautifulSoup(data.text, 'html.parser')
- inner_html = html.find('script', class_='yoast-schema-graph')
- json_data = json.loads(inner_html.contents[0])
- graph_data = json_data["@graph"]
- for i in graph_data:
- if(i["@type"] == "Recipe"):
- recipe = {}
- instructions = []
- for instruction in i["recipeInstructions"]:
- instructions.append(instruction["text"])
- keywords_list = i["keywords"].split(",")
- tags = i["recipeCuisine"] + keywords_list
- cleaned_tags = list(set([tag.strip().lower() for tag in tags]))
- slug = parse.quote(i["name"]).lower()
-
- # The recipe
- recipe["user"] = user_name
- recipe["slug"] = slug
- recipe["title"] = i["name"]
- recipe["image"] = i["image"][0]
- recipe["url"] = i["mainEntityOfPage"]
- recipe["tags"] = cleaned_tags
- recipe["ingredients"] = i["recipeIngredient"]
- recipe["instructions"] = instructions
- recipe["visible_by"] = ["jez"]
- # recipe["encoded_url"] = urllib.parse.quote(i["name"])
- # Complete this all later!!
- return recipe
+ try:
+ data = requests.get(url, headers= {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"})
+ html = BeautifulSoup(data.text, 'html.parser')
+ inner_html = html.find('script', class_='yoast-schema-graph')
+ json_data = json.loads(inner_html.contents[0])
+ graph_data = json_data["@graph"]
+ # print(graph_data, "graph data")
+ for i in graph_data:
+ if(i["@type"] == "Recipe"):
+ recipe = {}
+ instructions = extractInstructions(i["recipeInstructions"])
+ keywords_list = i["keywords"].split(",")
+ tags = i["recipeCuisine"] + keywords_list
+ cleaned_tags = list(set([tag.strip().lower() for tag in tags]))
+ slug = parse.quote(i["name"]).lower()
+
+ # The recipe
+ recipe["user"] = user_name
+ recipe["slug"] = slug
+ recipe["title"] = i["name"]
+ recipe["image"] = i["image"][0]
+ recipe["url"] = i["mainEntityOfPage"]
+ recipe["tags"] = cleaned_tags
+ recipe["ingredients"] = i["recipeIngredient"]
+ recipe["instructions"] = instructions
+ recipe["visible_by"] = ["jez"]
+ # recipe["encoded_url"] = urllib.parse.quote(i["name"])
+
+ return {"success": True, "data": recipe}
+
+ except Exception as e:
+ print(str(e))
+ return {"success": False, "error": "couldn't scrape site metadata"}