aboutsummaryrefslogtreecommitdiff
path: root/scripts/scraping/scraper.py
blob: 8919d4635958176c6c6e592357cc4ea2c4a435b7 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import requests
from urllib import parse
import json
from bs4 import BeautifulSoup

def extractInstructions(instructions):
    """Flatten a schema.org ``recipeInstructions`` list into a simple structure.

    Each entry is either a plain step (a dict with a ``"text"`` key) or a
    section (a dict with a ``"name"`` and nested steps under
    ``"itemListElement"`` — presumably a HowToSection; confirmed only by shape).

    Args:
        instructions: List of instruction dicts from the recipe's JSON-LD.

    Returns:
        A list whose items are either plain step strings, or dicts of the form
        ``{"name": <section name>, "subInstructions": [<step strings>]}``.
    """
    returnedInstructions = []
    for inst in instructions:
        if "itemListElement" in inst:
            # Sectioned instructions: keep the section name and collect
            # the text of each nested step.
            returnedInstructions.append({
                "name": inst["name"],
                "subInstructions": [el["text"] for el in inst["itemListElement"]],
            })
        else:
            # Flat instruction: just the step text.
            returnedInstructions.append(inst["text"])

    return returnedInstructions

def scrape(url, user_name):
    """Scrape recipe metadata from a page that embeds a Yoast JSON-LD graph.

    Fetches ``url``, locates the ``yoast-schema-graph`` script tag, parses its
    JSON-LD ``@graph``, and builds a recipe dict from the first ``Recipe`` node.

    Args:
        url: Page URL to fetch.
        user_name: Stored as the recipe's ``user`` field.

    Returns:
        ``{"success": True, "data": <recipe dict>}`` on success, otherwise
        ``{"success": False, "error": "couldn't scrape site metadata"}``.
    """
    try:
        # Browser-like User-Agent: some sites block default requests UA strings.
        data = requests.get(url, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"})
        html = BeautifulSoup(data.text, 'html.parser')
        inner_html = html.find('script', class_='yoast-schema-graph')
        json_data = json.loads(inner_html.contents[0])

        for node in json_data["@graph"]:
            if node["@type"] != "Recipe":
                continue

            # Keywords come as one comma-separated string; recipeCuisine is
            # concatenated directly, so it is assumed to be a list — TODO confirm.
            keywords_list = node["keywords"].split(",")
            tags = node["recipeCuisine"] + keywords_list
            cleaned_tags = list(set(tag.strip().lower() for tag in tags))

            recipe = {
                "user": user_name,
                "slug": parse.quote(node["name"]).lower(),
                "title": node["name"],
                "image": node["image"][0],
                "url": node["mainEntityOfPage"],
                "tags": cleaned_tags,
                "ingredients": node["recipeIngredient"],
                "instructions": extractInstructions(node["recipeInstructions"]),
                "visible_by": ["jez"],
            }
            # Return the first Recipe node; no need to scan the rest of the graph.
            return {"success": True, "data": recipe}

        # No Recipe node found. Previously this path raised an unbound-local
        # NameError that was masked by the broad except; fail explicitly instead.
        return {"success": False, "error": "couldn't scrape site metadata"}

    except Exception as e:
        print(str(e))
        return {"success": False, "error": "couldn't scrape site metadata"}