# Amazon.it-scraper/scraper4py.py (Pixel2075/Amazon.it-scraper, branch main)

from bs4 import BeautifulSoup
import requests
from requests_html import HTMLSession
from scraper import get_url_desc_price,get_item,get_image2
def _parse_specs_table(soup):
    """Read the product overview table from ``soup`` into a dict.

    Looks up the ``<table class="a-normal a-spacing-micro">`` element and, for
    every ``<tr>`` in it, maps the text of the ``td.a-span3`` cell to the text
    of the ``td.a-span9`` cell. Cell text is returned exactly as found (it is
    not stripped).

    Raises:
        AttributeError: if the table, or either cell of any row, is missing.
    """
    table = soup.find('table', {'class': 'a-normal a-spacing-micro'})
    parsed = {}
    for row in table.find_all('tr'):
        label = row.find('td', {'class': 'a-span3'}).text
        value = row.find('td', {'class': 'a-span9'}).text
        parsed[label] = value
    return parsed


def get_specs(url, soup):
    """Return the product specs as a ``{label: value}`` dict.

    The already-parsed ``soup`` is tried first. If the table cannot be read
    from it, ``url`` is fetched again with ``HTMLSession``, rendered, and the
    rendered markup is parsed instead.

    Args:
        url: Product page URL; only used for the rendered fallback.
        soup: Parsed product page exposing ``find`` / ``find_all``.

    Returns:
        A dict of label text to value text. Empty when neither attempt
        yields a readable table. Each attempt builds its own dict, so a
        partially failed first attempt never leaks entries into the result.
    """
    try:
        return _parse_specs_table(soup)
    except Exception:
        # Best effort: the static markup had no usable table, so retry on the
        # JavaScript-rendered page.
        try:
            # The context manager closes the session (and any browser it
            # started for rendering) even when parsing fails.
            with HTMLSession() as session:
                response = session.get(url)
                response.html.render(sleep=1)
                # render() updates response.html, not response.content, so
                # the rendered markup must be read from response.html.html.
                rendered = BeautifulSoup(response.html.html, 'html.parser')
                return _parse_specs_table(rendered)
        except Exception:
            # Deliberate best effort: any failure means "no specs found".
            return {}
def _parse_details_table(soup):
    """Read the technical-details table from ``soup`` into a dict.

    Looks up ``<table id="productDetails_techSpec_section_1">`` and, for every
    ``<tr>`` in it, maps the text of the ``<th>`` cell (as found, unstripped)
    to the whitespace-stripped text of the ``<td>`` cell.

    Raises:
        AttributeError: if the table, or either cell of any row, is missing.
    """
    table = soup.find('table', {'id': 'productDetails_techSpec_section_1'})
    parsed = {}
    for row in table.find_all('tr'):
        heading = row.find('th').text
        parsed[heading] = row.find('td').text.strip()
    return parsed


def get_details(url, soup):
    """Return the product's technical details as a ``{heading: value}`` dict.

    The already-parsed ``soup`` is tried first. If the table cannot be read
    from it, ``url`` is fetched again with ``HTMLSession``, rendered, and the
    rendered markup is parsed instead.

    Args:
        url: Product page URL; only used for the rendered fallback.
        soup: Parsed product page exposing ``find`` / ``find_all``.

    Returns:
        A dict of heading text to stripped value text, or
        ``{'details': 'N/A'}`` when neither attempt yields a readable table.
        Each attempt builds its own dict, so headings collected by a
        partially failed first attempt cannot be paired with fallback values.
    """
    try:
        return _parse_details_table(soup)
    except Exception:
        # Best effort: the static markup had no usable table, so retry on the
        # JavaScript-rendered page.
        try:
            # The context manager closes the session (and any browser it
            # started for rendering) even when parsing fails.
            with HTMLSession() as session:
                response = session.get(url)
                response.html.render(sleep=1)
                # render() updates response.html, not response.content, so
                # the rendered markup must be read from response.html.html.
                rendered = BeautifulSoup(response.html.html, 'html.parser')
                return _parse_details_table(rendered)
        except Exception:
            # Deliberate best effort: report the placeholder on any failure.
            return {'details': 'N/A'}
def _image_record(src):
    """Build the image result dict for the image URL ``src``.

    The file name is the sixth ``/``-separated part of the URL.

    Raises:
        IndexError: if the URL has fewer than six ``/``-separated parts.
    """
    file_name = src.split('/')[5]
    return {
        'img_url': src,
        'splited': file_name,
        # Same name without its last four characters (e.g. a ".jpg" suffix).
        'non_splited': file_name[0:-4],
    }


def get_image(url, soup, item):
    """Return image information for a product page.

    Three sources are tried in order; the first that succeeds wins:

    1. ``<img>`` tags inside ``<div id="imgTagWrapperId">`` in ``soup``
       (the last such image is used).
    2. ``get_image2(item)``, whose result is returned unchanged.
    3. Any ``<img>`` in ``soup`` whose ``src`` has ``I`` as its fifth
       ``/``-separated part and a file name after it (the last match is used).

    Args:
        url: Unused; kept for interface compatibility.
        soup: Parsed product page exposing ``find_all``.
        item: Passed through to ``get_image2``.

    Returns:
        For sources 1 and 3, a dict with ``img_url``, ``splited`` (file name)
        and ``non_splited`` (file name minus its last four characters). When
        every source fails, a dict whose values are all ``'N/A'``; it carries
        both ``img_url`` and the legacy ``image_url`` key so callers reading
        either spelling keep working.
    """
    # 1. Main product image wrapper.
    try:
        sources = [
            img['src']
            for wrapper in soup.find_all('div', {'id': 'imgTagWrapperId'})
            for img in wrapper.find_all('img')
        ]
        # An empty list raises IndexError here and moves on to the fallbacks.
        return _image_record(sources[-1])
    except Exception:
        pass  # Best effort: try the next source.

    # 2. Helper imported from the scraper module.
    try:
        return get_image2(item)
    except Exception:
        pass  # Best effort: try the next source.

    # 3. Scan every <img> on the page for a matching URL.
    try:
        candidates = soup.find_all('img')
    except (AttributeError, TypeError):
        candidates = []
    found = None
    for img in candidates:
        # Read the tag's src attribute; the tag object itself is not a string.
        src = img.get('src')
        parts = src.split('/') if isinstance(src, str) else []
        if len(parts) > 5 and parts[4] == 'I':
            found = _image_record(src)
    if found is not None:
        return found

    return {
        'splited': 'N/A',
        'non_splited': 'N/A',
        'image_url': 'N/A',
        'img_url': 'N/A',
    }

def get_asin(url):
    """Extract the ASIN from a product URL.

    The segment following a ``dp`` path segment is preferred, which covers
    both ``/<slug>/dp/<ASIN>/...`` and the short ``/dp/<ASIN>`` form. When the
    URL has no ``dp`` segment, the sixth ``/``-separated part is used, as
    before. Any query string or fragment attached to the chosen segment is
    removed.

    Args:
        url: Product page URL.

    Returns:
        The ASIN segment, or the string ``'Error'`` when ``url`` is not a
        string or has too few segments.
    """
    try:
        segments = url.split('/')
        # Ignore a trailing "dp" with nothing after it.
        if 'dp' in segments[:-1]:
            candidate = segments[segments.index('dp') + 1]
        else:
            candidate = segments[5]
    except (AttributeError, TypeError, IndexError):
        return 'Error'
    # Drop "?query" / "#fragment" glued to the last path segment.
    return candidate.split('?', 1)[0].split('#', 1)[0]