Skip to content

Commit 3c553ea

Browse files
committed
Added styling to the tables, modified the HTML output a bit. Also created a function for grabbing the page title.
1 parent c681057 commit 3c553ea

File tree

2 files changed

+369
-49
lines changed

2 files changed

+369
-49
lines changed

screenshotClustering/cluster.py

Lines changed: 222 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,9 @@
99
from bs4 import BeautifulSoup
1010

1111
try:
12-
from urllib.parse import quote,unquote
12+
from urllib.parse import quote,unquote
1313
except:
14-
from urllib import quote,unquote
14+
from urllib import quote,unquote
1515

1616
def addAttrToBag(attrName,url,link,wordBags,soup):
1717
for tag in soup.findAll('',{attrName:True}):
@@ -61,7 +61,6 @@ def createWordBags(htmlList):
6161
addAttrToBag('class',f,False,wordBags,soup)
6262
addTagToBag('title',f,False,wordBags,soup)
6363
addTagToBag('h1',f,False,wordBags,soup)
64-
6564
return wordBags
6665

6766
def getNumWords(wordBag):
@@ -114,7 +113,6 @@ def createClusters(wordBags,threshold):
114113
if (clusterData[siteList[j]][0] <= threshold and score > clusterData[siteList[j]][0]):
115114
clusterData[siteList[j]][1] = i
116115
clusterData[siteList[j]][0] = score
117-
118116
return clusterData
119117

120118
def getScopeHtml(scopeFile):
@@ -126,65 +124,239 @@ def getScopeHtml(scopeFile):
126124
scopeText = scopeText + line+'<br/>'
127125
return scopeText
128126

127+
def getPageTitle(htmlFile):
    """Return the text of the <title> element of a saved HTML page.

    The file is opened in binary mode so that BeautifulSoup performs its own
    encoding detection; opening it in text mode would make Python 3 decode
    with the locale default and fail on pages in another encoding.

    Args:
        htmlFile: Path of the HTML file to inspect.

    Returns:
        The title as text (never bytes) with non-ASCII characters dropped and
        surrounding whitespace removed, or "No Page Title Found" when the
        page has no usable title. The value is NOT HTML-escaped; callers that
        embed it in markup must escape it themselves.
    """
    fallback = "No Page Title Found"

    with open(htmlFile, 'rb') as f:
        soup = BeautifulSoup(f, "lxml")

    try:
        # encode() drops the non-ASCII characters; decode() turns the result
        # back into text so it can be concatenated with str on Python 3
        # (a bare encode() yields bytes there).
        title = soup.title.string.encode('ascii', 'ignore').decode('ascii')
    except AttributeError:
        # Either soup.title or soup.title.string was None.
        return fallback

    title = title.strip()
    if not title:
        return fallback
    return title
135+
129136
def renderClusterHtml(clust,width,height,scopeFile=None):
    """Render the clustered sites as the three parts of the catalog page.

    One table is produced per cluster: the header row carries the page title
    of the cluster's first site, the left cell shows that site's screenshot
    and the right cell lists a link for every site in the cluster.

    Args:
        clust: Mapping whose values are lists of saved-page file names.
            Assumes every name ends in ".html" and starts with a two
            character prefix such as "./" (the code strips both) -- TODO
            confirm against the caller.
        width: Width of the thumbnail image.
        height: Height of the thumbnail image.
        scopeFile: Optional scope file handed to getScopeHtml(); whatever
            that returns is added to the header unless it is None.

    Returns:
        A list ``[header, html, footer]`` of strings.
    """
    try:
        from html import escape   # Python 3
    except ImportError:
        from cgi import escape    # Python 2

    def _text(value):
        """Turn Python 3 bytes into text; leave every other value as is."""
        if isinstance(value, bytes) and not isinstance(value, str):
            return value.decode('utf-8', 'replace')
        return value

    def _displayName(quotedName):
        """Undo two levels of percent-encoding and drop the 2-char prefix."""
        name = unquote(unquote(quotedName[2:]))
        # Python 2 unquote() returns a byte string; Python 3 returns text,
        # which has no .decode() method.
        if isinstance(name, bytes):
            name = name.decode('utf-8', 'replace')
        return name

    scopeHtml = getScopeHtml(scopeFile)
    header = '''
<HTML>
<title>Web Application Catalog</title>
<BODY>
'''
    if scopeHtml is not None:
        header = header + scopeHtml
    header = header + '''
<script type="text/javascript" src="popup.js"></script>
<LINK href="style.css" rel="stylesheet" type="text/css">
<div class="table-title">
<h3>Web Application Catalog:</h3>
</div>
'''

    # Collect fragments and join once instead of repeated string +=.
    parts = []
    for siteList in clust.values():
        if not siteList:
            # Nothing to show; indexing siteList[0] would raise IndexError.
            continue

        # The title comes from an arbitrary scraped page, so escape it.
        title = escape(_text(getPageTitle(siteList[0])))
        parts.append("""
<table class="table-fill">
<thead>
<TR>
<th class="text-left" colspan="2">
""" + title + """ </th>
</TR>
</thead>
<TR>
""")

        # [0:-4] keeps the trailing dot of ".html", hence 'png' without a dot.
        thumbnail = quote(siteList[0][0:-4], safe='./')
        # Attribute values are quoted so the closing "/" is not swallowed
        # into the height value.
        parts.append('<TD> <img src="' + thumbnail + 'png" width="' + str(width)
                     + '" height="' + str(height) + '"/></TD><TD>')

        lastIndex = len(siteList) - 1
        for index, site in enumerate(siteList):
            screenshotName = quote(site[0:-5], safe='./')
            target = escape(_displayName(screenshotName), True)
            # Entries are separated by <br />; the last one has none.
            separator = '<br />' if index != lastIndex else ''
            parts.append('<div onmouseout="clearPopup()" onmouseover="popUp(event,\''
                         + screenshotName + '.png\');"><a href="' + target + '">'
                         + target + '</a>' + separator + '</div>')

        # Close the table by position rather than by comparing values, so a
        # duplicated site name cannot close it early.
        parts.append(' </TD></TR></table>')

    html = ''.join(parts)
    footer = '</BODY></HTML>'
    return [header,html,footer]
176+
177+
178+
159179
def printJS():
    """Write the screenshot pop-up helper script to ``popup.js``.

    The file is created (or overwritten) in the current working directory.
    It defines two JavaScript functions used by the generated catalog page:
    ``popUp(e, src)`` appends an image with class "popUp" to the document
    body, and ``clearPopup()`` removes every element with that class.
    """
    # x and y are declared with var so they do not leak as globals, and the
    # left offset carries an explicit "px" unit; a unit-less length is only
    # accepted when the page is rendered in quirks mode.
    js = """
function popUp(e,src)
{
    var x = e.clientX;
    var y = e.clientY;

    var img = document.createElement("img");
    img.src = src;
    img.setAttribute("class","popUp");
    img.setAttribute("style","position:fixed;left:"+(x+15)+"px;top:"+0+"px;background-color:white");
    //img.setAttribute("onmouseout","clearPopup(event)")
    // This next line will just add it to the <body> tag
    document.body.appendChild(img);
}

function clearPopup()
{
    var popUps = document.getElementsByClassName('popUp');
    while(popUps[0]) {
        popUps[0].parentNode.removeChild(popUps[0]);
    }
}
"""

    # The context manager closes the file even if write() raises.
    with open('popup.js','w') as f:
        f.write(js)
187207

208+
def printCSS():
    """Write the stylesheet for the generated catalog page to ``style.css``.

    The file is created (or overwritten) in the current working directory.
    It styles the page body, the ``table-title`` heading block and the
    ``table-fill`` tables emitted for each cluster.
    """
    # Relative to the first version of this stylesheet: the misspelled
    # "border-bottom-" property is now "border-bottom", a doubled semicolon
    # is removed, the out-of-range rgba(256, ...) is 255, and the web font is
    # imported over https so it is not blocked as mixed content.
    css = """
@import url(https://fonts.googleapis.com/css?family=Roboto:400,500,700,300,100);

body {
  background-color: #3e94ec;
  font-family: "Roboto", helvetica, arial, sans-serif;
  font-size: 16px;
  font-weight: 400;
  text-rendering: optimizeLegibility;
}

div.table-title {
  display: block;
  margin: auto;
  max-width: 600px;
  padding:5px;
  width: 100%;
}

.table-title h3 {
  color: #fafafa;
  font-size: 30px;
  font-weight: 400;
  font-style:normal;
  font-family: "Roboto", helvetica, arial, sans-serif;
  text-shadow: -1px -1px 1px rgba(0, 0, 0, 0.1);
  text-transform:uppercase;
}


/*** Table Styles **/

.table-fill {
  background: white;
  border-radius:3px;
  border-collapse: collapse;
  height: 320px;
  margin: auto;
  margin-bottom: 50px;
  max-width: 600px;
  padding:5px;
  width: 100%;
  box-shadow: 0 5px 10px rgba(0, 0, 0, 0.1);
  animation: float 5s infinite;
}

th {
  color:#D5DDE5;
  background:#1b1e24;
  border-bottom:4px solid #9ea7af;
  border-right: 1px solid #343a45;
  font-size:23px;
  font-weight: 100;
  padding:24px;
  text-align:left;
  text-shadow: 0 1px 1px rgba(0, 0, 0, 0.1);
  vertical-align:middle;
}

th:first-child {
  border-top-left-radius:3px;
}

th:last-child {
  border-top-right-radius:3px;
  border-right:none;
}

tr {
  border-top: 1px solid #C1C3D1;
  border-bottom: 1px solid #C1C3D1;
  color:#666B85;
  font-size:16px;
  font-weight:normal;
  text-shadow: 0 1px 1px rgba(255, 255, 255, 0.1);
}

tr:hover td {
  background:#4E5066;
  color:#FFFFFF;
  border-top: 1px solid #22262e;
  border-bottom: 1px solid #22262e;
}

tr:first-child {
  border-top:none;
}

tr:last-child {
  border-bottom:none;
}

tr:nth-child(odd) td {
  background:#EBEBEB;
}

tr:nth-child(odd):hover td {
  background:#4E5066;
}

tr:last-child td:first-child {
  border-bottom-left-radius:3px;
}

tr:last-child td:last-child {
  border-bottom-right-radius:3px;
}

td {
  background:#FFFFFF;
  padding:20px;
  text-align:left;
  vertical-align:middle;
  font-weight:300;
  font-size:18px;
  text-shadow: -1px -1px 1px rgba(0, 0, 0, 0.1);
  border-right: 1px solid #C1C3D1;
}

td:last-child {
  border-right: 0px;
}

th.text-left {
  text-align: left;
}

th.text-center {
  text-align: center;
}

th.text-right {
  text-align: right;
}

td.text-left {
  text-align: left;
}

td.text-center {
  text-align: center;
}

td.text-right {
  text-align: right;
}
"""

    # The context manager closes the file even if write() raises.
    with open('style.css','w') as f:
        f.write(css)
359+
188360
def doCluster(htmlList):
189361
siteWordBags = createWordBags(htmlList)
190362
clusterData = createClusters(siteWordBags,0.6)
@@ -268,7 +440,7 @@ def doDiff(htmlList,diffList):
268440
htmlRegex = re.compile('.*html.*')
269441
for fileName in os.listdir(path):
270442
if(htmlRegex.match(fileName)):
271-
htmlList.append(path+fileName)
443+
htmlList.append(path+fileName)
272444

273445
n = len(htmlList)
274446

@@ -281,7 +453,7 @@ def doDiff(htmlList,diffList):
281453
diffList = []
282454
for fileName in os.listdir(args.diff):
283455
if(htmlRegex.match(fileName)):
284-
diffList.append(args.diff+fileName)
456+
diffList.append(args.diff+fileName)
285457

286458
lists = doDiff(htmlList,diffList)
287459

@@ -315,4 +487,5 @@ def doDiff(htmlList,diffList):
315487
f = open(args.output,'w')
316488
f.write(html)
317489
printJS()
490+
printCSS()
318491

0 commit comments

Comments
 (0)