import json
from urllib import parse

import requests
from bs4 import BeautifulSoup

USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)


def extractInstructions(instructions):
    """Flatten schema.org recipeInstructions into plain strings or named sub-sections."""
    returnedInstructions = []
    for inst in instructions:
        # If the step is a HowToSection, its steps are nested under itemListElement
        if "itemListElement" in inst:
            print("hitting sub")
            subObj = {}
            subInst = []
            subObj["name"] = inst["name"]
            for el in inst["itemListElement"]:
                subInst.append(el["text"])
            subObj["subInstructions"] = subInst
            returnedInstructions.append(subObj)
        # Otherwise it is a plain HowToStep with the text directly on it
        else:
            returnedInstructions.append(inst["text"])
    return returnedInstructions


def scrape(url):
    """Fetch a page, read its Yoast schema graph, and return the Recipe node as a dict."""
    try:
        data = requests.get(url, headers={"User-Agent": USER_AGENT})
        html = BeautifulSoup(data.text, "html.parser")
        inner_html = html.find("script", class_="yoast-schema-graph")
        json_data = json.loads(inner_html.contents[0])
        graph_data = json_data["@graph"]
        # print(graph_data, "graph data")

        for i in graph_data:
            if i["@type"] == "Recipe":
                recipe = {}
                instructions = extractInstructions(i["recipeInstructions"])
                keywords_list = i["keywords"].split(",")
                tags = i["recipeCuisine"] + keywords_list
                cleaned_tags = list(set(tag.strip().lower() for tag in tags))
                slug = parse.quote(i["name"]).lower()

                # The recipe
                recipe["slug"] = slug
                recipe["title"] = i["name"]
                recipe["image"] = i["image"][0]
                recipe["url"] = i["mainEntityOfPage"]
                recipe["tags"] = cleaned_tags
                recipe["ingredients"] = i["recipeIngredient"]
                recipe["instructions"] = instructions
                recipe["visible_by"] = ["jez"]
                # recipe["encoded_url"] = parse.quote(i["name"])

                return {"success": True, "data": recipe}

        # The graph parsed but contained no Recipe node
        return {"success": False, "error": "no Recipe node found in schema graph"}
    except Exception as e:
        print(str(e))
        return {"success": False, "error": "couldn't scrape site metadata"}
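

# A minimal usage sketch, assuming the module is run directly. The URL below is a
# hypothetical placeholder (not from the original code) and only works against a
# site that embeds a Yoast schema graph containing a Recipe node.
if __name__ == "__main__":
    result = scrape("https://example.com/some-recipe")
    if result["success"]:
        print(json.dumps(result["data"], indent=2))
    else:
        print(result["error"])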