Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
#! /usr/bin/env python
import sys
import os
import logging
import argparse
import glob
import datetime
import random
from collections import defaultdict
sys.path.append('./web-profiler')
from webloader.har import Har, HarError
# How much data should go in each slice if there is only one slice
def slice_sizes_one_slice(obj, dummy1, dummy2):
    """Return the total request and response sizes for a one-slice layout.

    Args:
        obj: object exposing ``request_headers_size``, ``request_body_size``,
            ``response_headers_size`` and ``response_body_size``.
        dummy1: unused; accepted so every slice-size function can be called
            with the same three arguments.
        dummy2: unused; see ``dummy1``.

    Returns:
        Tuple ``(request_size, response_size)``, each the header size plus the
        body size of that direction.
    """
    # Body sizes are clamped at 0 so a negative value (presumably the HAR
    # "-1 = unknown" marker -- confirm against the Har loader) does not
    # shrink the total. The response side is clamped the same way as the
    # request side.
    request_size = obj.request_headers_size + max(obj.request_body_size, 0)
    response_size = obj.response_headers_size + max(obj.response_body_size, 0)
    return request_size, response_size
# How much data should go in each slice if there's one slice each for (1) req
# headers, (2) req body, (3) resp headers, (4) resp body
def slice_sizes_headers_content(obj, dummy1, dummy2):
    """Return four-slice size strings for one request/response pair.

    The four ``_``-separated fields are, in order: request headers, request
    body, response headers, response body. The request string fills the
    first two fields and zeroes the rest; the response string does the
    opposite.

    Args:
        obj: object exposing ``request_headers_size``, ``request_body_size``,
            ``response_headers_size`` and ``response_body_size``.
        dummy1: unused; accepted so every slice-size function can be called
            with the same three arguments.
        dummy2: unused; see ``dummy1``.

    Returns:
        Tuple ``(request_sizes, response_sizes)`` of strings, e.g.
        ``('100_20_0_0', '0_0_200_3000')``.
    """
    # Body sizes are clamped at 0 so a negative value (presumably the HAR
    # "-1 = unknown" marker -- confirm against the Har loader) never shows
    # up as a negative slice size.
    request_sizes = '%d_%d_0_0' % (obj.request_headers_size,
                                   max(obj.request_body_size, 0))
    response_sizes = '0_0_%d_%d' % (obj.response_headers_size,
                                    max(obj.response_body_size, 0))
    return request_sizes, response_sizes
# Largest total_slices value computed by slice_sizes_slice_per_header so far.
max_slices = 0


# How much data should go in each slice if each HTTP header gets its own slice?
def slice_sizes_slice_per_header(obj, num_req_slices, num_resp_slices):
    """Return per-header slice size strings for one request/response pair.

    Each returned string is a ``_``-separated list of byte counts with
    ``num_req_slices + num_resp_slices + 5`` fields:

    * request string: request line, one field per request header, request
      body, then zero padding;
    * response string: zero padding, status line, one field per response
      header, response body.

    Side effect: raises the module-level ``max_slices`` to the largest
    ``total_slices`` value seen so far.

    Args:
        obj: object exposing ``request_headers`` / ``response_headers``
            (mappings of header name to value -- assumed dict-like, confirm
            against the Har loader), ``request_headers_size``,
            ``response_headers_size``, ``request_body_size`` and
            ``response_body_size``.
        num_req_slices: maximum number of request headers on any object.
        num_resp_slices: maximum number of response headers on any object.

    Returns:
        Tuple ``(request_sizes, response_sizes)`` of strings.
    """
    global max_slices
    total_slices = num_req_slices + num_resp_slices + 3  # +1 for GET, +2 bodies

    # Each header is counted as name + value + 4 bytes (": " and "\r\n").
    req_hdr_sizes = [len(header) + len(value) + 4
                     for header, value in obj.request_headers.items()]
    # The extra 2 bytes are the blank "\r\n" line that ends the header block.
    # NOTE: those 2 bytes are not attributed to any slice.
    total_req_hdr_size = 2 + sum(req_hdr_sizes)

    # Whatever is left of the header block is the request line (GET ...).
    request_fields = [obj.request_headers_size - total_req_hdr_size]
    request_fields.extend(req_hdr_sizes)
    request_fields.append(max(obj.request_body_size, 0))
    # pad with 0's for extra request slices + response slices
    request_fields.extend([0] * (total_slices - len(obj.request_headers)))
    request_sizes = '_'.join('%d' % size for size in request_fields)

    # Same accounting for the response headers.
    resp_hdr_sizes = [len(header) + len(value) + 4
                      for header, value in obj.response_headers.items()]
    # NOTE: as above, the trailing 2-byte "\r\n" is not attributed to a slice.
    total_resp_hdr_size = 2 + sum(resp_hdr_sizes)

    # pad with 0's for extra response slices + request slices
    response_fields = [0] * (total_slices - len(obj.response_headers))
    # Whatever is left of the header block is attributed to the status line.
    response_fields.append(obj.response_headers_size - total_resp_hdr_size)
    response_fields.extend(resp_hdr_sizes)
    response_fields.append(max(obj.response_body_size, 0))
    response_sizes = '_'.join('%d' % size for size in response_fields)

    # make sure we have the right number of slices in each column
    if len(request_fields) != len(response_fields):
        # Single pre-formatted argument so the output is the same on
        # Python 2 and Python 3.
        print('WRONG %d %d' % (len(request_fields), len(response_fields)))

    max_slices = max(total_slices, max_slices)
    return request_sizes, response_sizes
def save_action_list_for_har(har_path, slice_sizes_func, slice_tag):
    """Write the action list and the details CSV for one HAR file.

    Two files are written next to ``har_path`` (extension replaced):

    * ``<base>.<slice_tag>.actions`` -- one line per object:
      ``<start offset sec> <request slices>;<response slices> <assigned conn>``
    * ``<base>.<slice_tag>.csv`` -- a header row, a blank line, then one
      detail row per object.

    Nothing is written when the HAR cannot be parsed, contains no objects,
    or its first object's response code is not 200.

    Args:
        har_path: path of the HAR file to load.
        slice_sizes_func: callable invoked as
            ``slice_sizes_func(obj, num_req_slices, num_resp_slices)`` that
            returns ``(request_slice_sizes, response_slice_sizes)``.
        slice_tag: label inserted into both output file names.
    """
    # load HAR
    try:
        har = Har.from_file(har_path)
        logging.info(har)
    except HarError:
        logging.exception('Error parsing HAR')
        return

    # an empty HAR has no first response to inspect (and nothing to write)
    if not har.objects:
        logging.warning('No objects in HAR; skipping. %s', har)
        return

    # make sure the first response is a 200 OK
    if har.objects[0].response_code != 200:
        logging.warning('Not 200 OK; skipping. %s', har)
        return

    # count the max # HTTP request and response headers in any request
    # (so we know how many slices we need for slice-per-header scenario)
    num_req_slices = max(len(obj.request_headers) for obj in har.objects)
    num_resp_slices = max(len(obj.response_headers) for obj in har.objects)

    base_path = os.path.splitext(har_path)[0]
    # FILE 1:
    # write a simpler "action list" for Matteo's client
    # fields:
    # time size assigned_conn
    action_path = '%s.%s.actions' % (base_path, slice_tag)
    # FILE 2:
    # write CSV list of objects along with time offset from beginning.
    # NOTE(review): fields are joined with plain commas and not quoted, so a
    # host/path/URL containing a comma will shift the columns.
    out_path = '%s.%s.csv' % (base_path, slice_tag)

    with open(action_path, 'w') as actionf, \
            open(out_path, 'w') as detailsf:
        detailsf.write('host,compressed size (bytes),original size (bytes),request slice bytes,response slice bytes,request start offset (sec),new TCP connection?,new SSL handshake?,new conn?,#conn so far,assigned conn,path,original URL\n\n')

        # LOOP OVER OBJECTS
        servers_so_far = set()  # track which servers this page has loaded from
        server_to_num_conn = defaultdict(int)
        for obj in har.objects:
            # did chrome open a new connection for this object?
            new_connection = obj.tcp_handshake\
                or obj.ssl_handshake\
                or obj.host not in servers_so_far

            if new_connection:
                # new connection for this server: it gets the next index
                server_to_num_conn[obj.host] += 1
                assigned_connection = server_to_num_conn[obj.host] - 1
            else:
                # otherwise assign to a random already-open connection; the
                # host has been seen before, so its count is at least 1
                assigned_connection = random.randint(
                    0, server_to_num_conn[obj.host] - 1)

            # how much data should go in each slice for req and resp?
            request_slice_sizes, response_slice_sizes =\
                slice_sizes_func(obj, num_req_slices, num_resp_slices)

            # seconds between the page start and this object's request
            start_offset = (obj.object_start_time
                            - har.page_start_time).total_seconds()

            # FILE 1 (actions)
            actionf.write('%f %s;%s %d\n' % (
                start_offset,
                request_slice_sizes,
                response_slice_sizes,
                assigned_connection))

            # FILE 2 (details)
            detailsf.write('%s,%d,%d,%s,%s,%f,%s,%s,%s,%d,%d,%s,%s\n' % (
                obj.host,
                obj.size,
                obj.content_size,
                request_slice_sizes,
                response_slice_sizes,
                start_offset,
                obj.tcp_handshake,
                obj.ssl_handshake,
                new_connection,
                server_to_num_conn[obj.host],
                assigned_connection,
                obj.path,
                obj.url))

            servers_so_far.add(obj.host)
def main():
    """Generate all three slice layouts for every requested HAR file.

    Reads the module-level ``args`` namespace (set up by the
    ``__main__`` section before this is called). If ``args.hars`` is a
    single directory, every ``*.har`` file directly inside it is processed;
    otherwise the entries of ``args.hars`` are used as HAR paths. After
    all files are processed the module-level ``max_slices`` is printed.
    """
    if len(args.hars) == 1 and os.path.isdir(args.hars[0]):
        harpaths = glob.glob(os.path.join(args.hars[0], '*.har'))
    else:
        harpaths = args.hars

    for harpath in harpaths:
        save_action_list_for_har(harpath, slice_sizes_one_slice, 'one-slice')
        save_action_list_for_har(harpath, slice_sizes_headers_content, 'four-slices')
        save_action_list_for_har(harpath, slice_sizes_slice_per_header, 'slice-per-header')

    # Parenthesised single argument: same output on Python 2 and Python 3.
    print(max_slices)
if __name__ == '__main__':
    # Command line interface. `args` stays a module-level name because
    # main() reads it directly.
    parser = argparse.ArgumentParser(
        description='Extract object request timings from a HAR file.')
    parser.add_argument(
        'hars', nargs='+',
        help='HAR files (or directory of HARs) to analyze')
    parser.add_argument(
        '-q', '--quiet', action='store_true',
        help='only print errors')
    parser.add_argument(
        '-v', '--verbose', action='store_true',
        help='print debug info. --quiet wins if both are present')
    args = parser.parse_args()

    # Logging verbosity: --quiet takes precedence over --verbose.
    if args.quiet:
        log_level = logging.WARNING
    else:
        log_level = logging.DEBUG if args.verbose else logging.INFO

    logging.basicConfig(
        format="%(levelname) -10s %(asctime)s %(module)s:%(lineno) -7s %(message)s",
        level=log_level,
    )

    main()