-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathurl_scrape_final.py
More file actions
68 lines (43 loc) · 1.52 KB
/
url_scrape_final.py
File metadata and controls
68 lines (43 loc) · 1.52 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from bs4 import BeautifulSoup
import csv
import requests
from time import sleep
# Download the "quotes by topic" index page and parse it with html5lib.
# The timeout stops a stalled connection from blocking the script forever.
r = requests.get(
    "http://www.famousquotesandauthors.com/quotes_by_topic.html",
    timeout=30,
)
data = r.content
soup = BeautifulSoup(data, 'html5lib')

# Table cells whose width attribute is exactly "36%"; the links inside them
# are the ones fetched later, one request per link.
finddiv = soup.find_all('td', attrs={'width': '36%'})

# Every href collected from the index page ends up in this list.
urllinks = []
for links in finddiv:
    anchors = links.find_all("a")
    # .get("href") yields None for an anchor without an href attribute.
    urls = [x.get("href") for x in anchors]
    urllinks.extend(urls)
    # Number of links found in this cell. A parenthesised single argument
    # prints the same text under Python 2 and Python 3.
    print(len(urls))
# Second pass over the parsed index page: cells carrying xwidth="33%".
# NOTE(review): "xwidth" is not a standard HTML attribute - confirm whether
# "width" was intended or whether this pass was disabled on purpose. It is
# left unchanged because enabling it would change which links are collected.
finddiv2 = soup.find_all('td', attrs={'xwidth': '33%'})
for links2 in finddiv2:
    anchors2 = links2.find_all("a")
    urls2 = [x2.get("href") for x2 in anchors2]
    # Append the links of THIS cell (urls2). The previous code re-appended
    # the stale `urls` list left over from the first pass.
    urllinks.extend(urls2)
    print(len(urls2))

# Single pre-formatted argument: identical output on Python 2 and Python 3.
print("total_no_of_urls : %d" % len(urllinks))
# Site root; the hrefs collected from the index page are appended to it.
URL2 = "http://www.famousquotesandauthors.com"
quotes = []  # a list to store quotes (unused: rows go straight to the CSV)
csvfile = "quotes.csv"

# Index of the first link to process. The slice below starts here too, so the
# progress counter and the resume point cannot drift apart.
curr = 743
total = len(urllinks)

for ul in urllinks[curr:]:
    if ul is None:
        # An anchor without an href was collected; URL2 + None would raise
        # TypeError, so skip it but keep the counter aligned with the list.
        print("Skipping topic %d /%d (no href)" % (curr, total))
        curr += 1
        continue

    # The timeout keeps one unresponsive page from hanging the whole run.
    r2 = requests.get(URL2 + ul, timeout=30)
    soup2 = BeautifulSoup(r2.content, 'html5lib')

    # Text of every <div> whose style attribute is exactly this string.
    # NOTE(review): .string is None for a div with more than one child node,
    # which is written as an empty row - confirm that is acceptable.
    winner = [
        div.string
        for div in soup2.find_all(
            'div', style="font-size:12px;font-family:Arial;"
        )
    ]
    print("Done topic %d /%d" % (curr, total))

    # Append after every page so an interrupted run keeps what it has and
    # can be resumed by raising `curr`.
    # NOTE(review): Python 2's csv module does not support non-ASCII unicode
    # input - confirm how such quotes should be encoded.
    print("moving to csv")
    with open(csvfile, "a") as output:
        writer = csv.writer(output, lineterminator='\n')
        for val in winner:
            writer.writerow([val])

    curr += 1
    sleep(1)  # pause between requests to the site