ZRHSpotterTool/ZRHGrabber.py at python3-wip · dxmekch/ZRHSpotterTool · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
#!/usr/bin/python3
# -*- coding: utf-8 -*-
"""
Created on Thu Jun 27 10:47:03 2019

@author: simon
"""

from datetime import datetime, timedelta
import urllib
from bs4 import BeautifulSoup
import requests

# NOTE: Earlier tests suggested that the headers are not required, but the
# server does seem to require them at least some of the time.
# Using the most recent browser headers (with a very long Cookie string and a
# few additional parameters) caused the request to be rejected.
# The headers below still seem to work fine.
# HTTP headers that ZRHGrabber sends with its requests.
headers = {
    'Host': 'www.zurich-airport.com',
    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
    'Accept': '*/*',
    'Accept-Language': 'en-US,en;q=0.5',
    'Accept-Encoding': 'gzip, deflate, br',
    'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
    'X-Requested-With': 'XMLHttpRequest',
    'Referer': 'https://www.zurich-airport.com/passengers-and-visitors/arrivals-and-departures/',
    'Content-Length': '85',
    # Adjacent literals are concatenated into one single-line cookie string.
    'Cookie': (
        'sc_expview=0; '
        'website#lang=en; '
        'ASP.NET_SessionId=tewblvuqsehbozbq5yakat2n; '
        '__RequestVerificationToken=vIUKElZhB4SVQfWDXu2DyJAYluqyOAecVSwB5sDOniFyFMTvSZJnrZNudEVMbrRzvtLS1v2GurbyFTCScCMOME-ybd81; '
        'TS01cd1ab8=018735b6f7dff7a3e09286a67ae052f4e2011617ee8846a666a6630182514b91ee5552f129831d7f6267e740d9595f99ca9149393dae9ac848ae68b61d427320fffd9b5eb9890ba550276dd5bd9c357f6c979b7a89d7ebac45ad4fe23564114c99cdab08f117858652ebe1195e659af262baa071e0bbd6fc395fb0b81a1ebd6bfe2799de00bde795de96bab4b0a8b52993ab0995a2'
    ),
    'DNT': '1',
    'Connection': 'keep-alive',
}

# Common prefix of the flight schedule endpoints; the fetch type and
# 'DetailData' are appended to it when a request is made.
url_base = 'https://www.zurich-airport.com/api/sitecore/FlightScheduleDetail/'

class ZRHGrabber:
    """Fetch and parse the Zurich Airport arrival/departure timetable.

    The timetable is requested page by page from the flight schedule
    endpoint with POST requests; each returned HTML table is converted
    into a list of plain dictionaries.
    """

    # Seconds to wait for the server before a request is abandoned.
    REQUEST_TIMEOUT = 30
    # Highest page index that is requested before the loop is aborted.
    MAX_PAGES = 40

    def __init__(self):
        self.hdr = headers
        self.url_base = url_base
        # Hours the local clock is assumed to be ahead of UTC
        # (e.g. Swiss local time 06:00 corresponds to 04:00 UTC).
        self.UTC_correction = 2.00

    @staticmethod
    def _parse_title(title):
        """Extract registration and aircraft type from a 'title' attribute.

        The title is expected to look like
        ``'<airline> <br/> Registration number: LNRGA <br/> Typ: 737-800'``.
        Both the spellings ``Typ:`` and ``Type:`` are accepted.

        Returns:
            A ``(registration, aircraft_type)`` tuple of stripped strings.

        Raises:
            ValueError: if either marker is missing from the title.
        """
        _, reg_marker, after_reg = title.partition('number:')
        if not reg_marker:
            raise ValueError('No registration number in title: {}'.format(title))
        registration = after_reg.split('<br')[0].strip()

        _, type_marker, after_type = title.partition('Typ')
        if not type_marker:
            raise ValueError('No aircraft type in title: {}'.format(title))
        # Drop only the optional trailing 'e' of "Type" and the colon, so
        # that type names which themselves contain "e " are left intact.
        if after_type.startswith('e'):
            after_type = after_type[1:]
        aircraft_type = after_type.strip().lstrip(':').strip()
        return registration, aircraft_type

    def _start_datetime(self, tomorrow=False, now=None):
        """Build the ``startDateTime`` value (UTC, full hour) for a request.

        Args:
            tomorrow: if true, return 02:00 UTC on the day after ``now``.
            now: local time to base the calculation on; defaults to
                ``datetime.now()``.

        Returns:
            A string formatted as ``'YYYY-MM-DDTHH:00:00.000Z'``.
        """
        if now is None:
            now = datetime.now()
        if tomorrow:
            next_day = now + timedelta(days=1)
            return next_day.strftime('%Y-%m-%d') + 'T02:00:00.000Z'
        # Date and hour are taken from the same shifted timestamp so that
        # they stay consistent shortly after local midnight.
        utc_now = now - timedelta(hours=self.UTC_correction)
        return utc_now.strftime('%Y-%m-%dT%H:00:00') + '.000Z'

    def parse_table(self, flighttable):
        """Convert a timetable HTML element into a list of flight dicts.

        Args:
            flighttable: BeautifulSoup element containing a ``<tbody>``
                with one ``<tr>`` per flight.

        Returns:
            A list of dictionaries with the keys ``airportinformation``,
            ``flightcode``, ``masterflight``, ``scheduled``, ``expected``
            and ``status``. Rows that cannot be parsed are reported on
            stdout and skipped; an element without ``<tbody>`` yields an
            empty list.
        """
        flights = []
        try:
            rows = flighttable.tbody.find_all('tr')
        except AttributeError:
            # flighttable is None or contains no <tbody>
            print('Error')
            return flights

        for flight in rows:
            try:
                link = flight.find('a', attrs={'class': 'main-code'})
                f_code = link.text  # flight code
                # registration and aircraft type are nested in 'title'
                f_reg, f_airc = self._parse_title(str(link.get('title')))
                # location airport
                f_loc = flight.find('div', attrs={'class': 'airport'}).contents[0].strip()
                # scheduled time
                f_time = flight.find('td', attrs={'class': 'plan'}).text.replace('\n', '')
                # expected time
                f_texp = flight.find('td', attrs={'class': 'plan ext'}).text.replace('\n', '')
                # status information
                f_status = flight.find('td', attrs={'class': 'status'}).text.replace('\n', '')
            except (AttributeError, IndexError, ValueError):
                # A cell is missing or the title has an unexpected layout.
                print('Parsing Error in flight')
                print(flight)
                continue

            flights.append({
                'airportinformation': {'airport_city': f_loc},
                'flightcode': f_code,
                'masterflight': {
                    'registration': f_reg,
                    'aircrafttype': f_airc,
                },
                'scheduled': f_time,
                'expected': f_texp,
                'status': f_status,
            })

        return flights

    def fetch(self, fetch_type='arrival', spotter=False, tomorrow=False):
        """Download the timetable by requesting page after page.

        Args:
            fetch_type: ``'arrival'``/``'arr'``/``'arrivals'``/``'Arrival'``
                or ``'departure'``/``'dep'``/``'departures'``/``'Departure'``.
            spotter: if true, the search term ``'spotter'`` is sent.
            tomorrow: if true, start at 02:00 UTC of the following day
                instead of the current hour.

        Returns:
            ``{'timetable': [...]}`` holding the flight dictionaries
            produced by ``parse_table`` for all fetched pages.

        Raises:
            ValueError: if ``fetch_type`` is not one of the known values.
        """
        if fetch_type in ('arrival', 'arr', 'arrivals', 'Arrival'):
            fetch_type = 'Arrival'
        elif fetch_type in ('departure', 'dep', 'departures', 'Departure'):
            fetch_type = 'Departure'
        else:
            raise ValueError('Unknown fetch type [arrival, departure]: {}'.format(fetch_type))

        # start time in GMT/UTC -> Swiss time 06:00:00 would be 04:00:00
        start_datetime = self._start_datetime(tomorrow)
        search_term = 'spotter' if spotter else ''
        url = self.url_base + fetch_type + 'DetailData'

        timetable = []
        page_n = 0

        # fetch every flight by scrolling through pages
        while True:
            # requests form-encodes the dictionary for the POST body
            body = {'startDateTime': start_datetime,
                    'search': search_term,
                    'page': str(page_n),
                    '__RequestVerificationToken': ''}

            # send POST request and parse using LXML
            response = requests.post(url, data=body, headers=self.hdr,
                                     timeout=self.REQUEST_TIMEOUT)
            parsed_html = BeautifulSoup(response.text, 'lxml')
            flighttable = parsed_html.find('div', attrs={'class': 'desktop-only'})

            # Check whether the website returns a "No Result Title" message,
            # which happens once the last page has been passed. This is odd
            # behaviour, since the web browser shows no such message but the
            # last available flight(s).
            # The Arrival page returns "No Result Title" ...
            # The Departure page returns "No flights found" ...
            try:
                first_row_text = flighttable.tbody.find_all('tr')[0].text
            except (AttributeError, IndexError):
                # Response without a timetable: nothing more to read.
                print('No flight table found in response. Stopping.')
                break
            if 'No Result' in first_row_text or 'No flights' in first_row_text:
                break

            page_flights = self.parse_table(flighttable)

            # If the most recently stored flight shows up again, the server
            # has returned an already fetched page: stop without adding it.
            if page_n > 0 and timetable:
                last_code = timetable[-1]['flightcode']
                if any(f['flightcode'] == last_code for f in page_flights):
                    print('last page reached')
                    print(last_code)
                    break

            timetable.extend(page_flights)
            page_n += 1

            if page_n > self.MAX_PAGES:  # stop at 40+ requests
                print('RunawayError: Too many pages requested. Aborting possible infinite loop')
                break

        return {'timetable': timetable}


"""
<tr>
    <td class="plan">
        <div class="inactive">13:00</div>
    </td>
    <td class="plan ext">
        <div>12:58</div>
    </td>
    <td class="location">
        <div class="airport">OSLO
            <div>Gardermoen Intl.</div>
        </div>
    </td>
    <td class="flight-nr tooltip">
        <a class="main-code" href="/passengers-and-visitors/arrivals-and-departures/airlines-en/?id=SK" title="SAS Scandinavian Airlines &lt;br/&gt; Registration number: LNRGA  &lt;br/&gt; Typ: 737-800">SK 841</a>
        <div class="codeshare-wrap">
            <div class="shade"></div>
            <div class="codeshare">
                <div class="telop">
                    <strong>Codeshare:</strong>
                    <a href="/passengers-and-visitors/arrivals-and-departures/airlines-en/?id=LX" title="SWISS International Air Lines &lt;br/&gt; Operating airline: SAS Scandinavian Airlines &lt;br/&gt; Registration number: LNRGA &lt;br/&gt; Typ: 737-800">LX 4711</a>
                    <a href="/passengers-and-visitors/arrivals-and-departures/airlines-en/?id=OU" title="Croatia Airlines &lt;br/&gt; Operating airline: SAS Scandinavian Airlines &lt;br/&gt; Registration number: LNRGA &lt;br/&gt; Typ: 737-800">OU 5679</a>
                </div>
            </div>
        </div>
    </td>
    <td class="terminal">
        <a href="/~/media/flughafenzh/dokumente/uebersichtsplaene/check-in-12.pdf">1</a>
    </td>
    <td class="baggage">
        <div>15</div>
    </td>
    <td class="status">
        <div class="green">landed</div>
    </td>
</tr>
"""