99from bs4 import BeautifulSoup
1010
1111try :
12- from urllib .parse import quote ,unquote
12+ from urllib .parse import quote ,unquote
1313except :
14- from urllib import quote ,unquote
14+ from urllib import quote ,unquote
1515
1616def addAttrToBag (attrName ,url ,link ,wordBags ,soup ):
1717 for tag in soup .findAll ('' ,{attrName :True }):
@@ -61,7 +61,6 @@ def createWordBags(htmlList):
6161 addAttrToBag ('class' ,f ,False ,wordBags ,soup )
6262 addTagToBag ('title' ,f ,False ,wordBags ,soup )
6363 addTagToBag ('h1' ,f ,False ,wordBags ,soup )
64-
6564 return wordBags
6665
6766def getNumWords (wordBag ):
@@ -114,7 +113,6 @@ def createClusters(wordBags,threshold):
114113 if (clusterData [siteList [j ]][0 ] <= threshold and score > clusterData [siteList [j ]][0 ]):
115114 clusterData [siteList [j ]][1 ] = i
116115 clusterData [siteList [j ]][0 ] = score
117-
118116 return clusterData
119117
120118def getScopeHtml (scopeFile ):
@@ -126,65 +124,239 @@ def getScopeHtml(scopeFile):
126124 scopeText = scopeText + line + '<br/>'
127125 return scopeText
128126
def getPageTitle(htmlFile):
    """Return the <title> text of a saved HTML page as an ASCII-only string.

    htmlFile -- path to the HTML file to parse.

    Returns the title with non-ASCII characters dropped and surrounding
    whitespace removed, or "No Page Title Found" when the page has no
    usable title.
    """
    # Read the file as bytes so BeautifulSoup works out the page encoding
    # itself; text mode would decode with the locale codec and can raise
    # UnicodeDecodeError on pages saved in another encoding.
    with open(htmlFile, 'rb') as f:
        soup = BeautifulSoup(f, "lxml")
    try:
        # encode() strips the non-ASCII characters; decode() turns the result
        # back into text so callers can concatenate it with other strings
        # (on Python 3 the encoded value alone would be bytes).
        title = soup.title.string.encode('ascii', 'ignore').decode('ascii')
    except AttributeError:
        # soup.title is None (no <title>) or soup.title.string is None.
        return "No Page Title Found"
    # A title made only of whitespace / stripped characters is as good as none.
    return title.strip() or "No Page Title Found"
135+
def renderClusterHtml(clust, width, height, scopeFile=None):
    """Build the HTML catalog page for a set of clustered sites.

    clust     -- mapping of cluster id to a list of saved-page file names;
                 the first entry of each list supplies the table heading
                 and the preview image.
    width     -- value for the preview image's width attribute.
    height    -- value for the preview image's height attribute.
    scopeFile -- passed to getScopeHtml(); whatever that returns is appended
                 to the header unless it is None.

    Returns [header, html, footer], three strings meant to be concatenated
    in that order.
    """
    try:
        from html import escape as escapeHtml
    except ImportError:  # Python 2 has no html module
        from cgi import escape as escapeHtml

    def decodeTarget(quotedName):
        # File names are percent-encoded twice, so undo both layers.
        target = unquote(unquote(quotedName))
        # Python 2's unquote yields a byte string; Python 3's yields text,
        # which has no decode() method.
        if isinstance(target, bytes):
            target = target.decode("utf-8", "replace")
        return target

    header = '''
<HTML>
<title>Web Application Catalog</title>
<BODY>
'''
    scopeHtml = getScopeHtml(scopeFile)
    if scopeHtml is not None:
        header = header + scopeHtml
    header = header + '''
<script type="text/javascript" src="popup.js"></script>
<LINK href="style.css" rel="stylesheet" type="text/css">
<div class="table-title">
<h3>Web Application Catalog:</h3>
</div>
'''

    # Collect fragments and join once instead of growing a string in a loop.
    parts = []
    for siteList in clust.values():
        if not siteList:
            # Nothing to show, and siteList[0] below would raise IndexError.
            continue

        title = getPageTitle(siteList[0])
        if isinstance(title, bytes):
            title = title.decode('ascii', 'ignore')
        # The title comes from a scraped page, so it must not be trusted as markup.
        parts.append("""
<table class="table-fill">
<thead>
<TR>
<th class="text-left" colspan="2">
""" + escapeHtml(title, True) + """ </th>
</TR>
</thead>
<TR>
""")

        # Dropping the last four characters leaves the trailing dot in place,
        # so only 'png' is appended here.
        previewName = quote(siteList[0][0:-4], safe='./')
        # Attribute values are quoted so the closing '/' is not parsed as
        # part of the height value.
        parts.append('<TD> <img src="' + previewName + 'png" width="' + str(width)
                     + '" height="' + str(height) + '"/></TD><TD>')

        linkDivs = []
        for site in siteList:
            screenshotName = quote(site[0:-5], safe='./')
            # [2:] skips the first two characters of the quoted name before
            # it is decoded into the link target.
            target = escapeHtml(decodeTarget(screenshotName[2:]), True)
            linkDivs.append('<div onmouseout="clearPopup()" onmouseover="popUp(event,\''
                            + screenshotName + '.png\');"><a href="' + target + '">'
                            + target + '</a>')
        # Every entry but the last is followed by a line break; the table is
        # closed exactly once, by position rather than by comparing values,
        # so duplicate file names cannot close it early.
        parts.append('<br /></div>'.join(linkDivs) + '</div> </TD></TR></table>')

    html = ''.join(parts)
    footer = '</BODY></HTML>'
    return [header, html, footer]
176+
177+
178+
def printJS():
    """Write popup.js, the hover-preview script, to the current directory.

    The script defines popUp(e, src), which appends a fixed-position <img>
    with class "popUp" to the document body, and clearPopup(), which removes
    every element carrying that class.
    """
    # x and y are declared with var so they do not leak as globals, and the
    # left offset carries a px unit because a unitless non-zero length is
    # invalid CSS outside quirks mode.
    js = """
    function popUp(e,src)
    {
        var x = e.clientX;
        var y = e.clientY;

        var img = document.createElement("img");
        img.src = src;
        img.setAttribute("class","popUp");
        img.setAttribute("style","position:fixed;left:"+(x+15)+"px;top:0;background-color:white");
        //img.setAttribute("onmouseout","clearPopup(event)")
        // This next line will just add it to the <body> tag
        document.body.appendChild(img);
    }

    function clearPopup()
    {
        var popUps = document.getElementsByClassName('popUp');
        while(popUps[0]) {
            popUps[0].parentNode.removeChild(popUps[0]);
        }
    }
    """

    # The with-block closes the file even if the write fails.
    with open('popup.js', 'w') as f:
        f.write(js)
187207
def printCSS():
    """Write style.css, the stylesheet for the catalog page, to the current directory.

    The rules style the page body, the "table-title" heading block, the
    "table-fill" tables and their th/tr/td cells, plus text-left /
    text-center / text-right alignment helpers.
    """
    # Two typos in the rules are corrected: a doubled semicolon in the th
    # colour, and "border-bottom-:" in the tr rule, which browsers discard
    # as an unknown property.
    css = """
@import url(http://fonts.googleapis.com/css?family=Roboto:400,500,700,300,100);

body {
  background-color: #3e94ec;
  font-family: "Roboto", helvetica, arial, sans-serif;
  font-size: 16px;
  font-weight: 400;
  text-rendering: optimizeLegibility;
}

div.table-title {
  display: block;
  margin: auto;
  max-width: 600px;
  padding:5px;
  width: 100%;
}

.table-title h3 {
  color: #fafafa;
  font-size: 30px;
  font-weight: 400;
  font-style:normal;
  font-family: "Roboto", helvetica, arial, sans-serif;
  text-shadow: -1px -1px 1px rgba(0, 0, 0, 0.1);
  text-transform:uppercase;
}


/*** Table Styles **/

.table-fill {
  background: white;
  border-radius:3px;
  border-collapse: collapse;
  height: 320px;
  margin: auto;
  margin-bottom: 50px;
  max-width: 600px;
  padding:5px;
  width: 100%;
  box-shadow: 0 5px 10px rgba(0, 0, 0, 0.1);
  animation: float 5s infinite;
}

th {
  color:#D5DDE5;
  background:#1b1e24;
  border-bottom:4px solid #9ea7af;
  border-right: 1px solid #343a45;
  font-size:23px;
  font-weight: 100;
  padding:24px;
  text-align:left;
  text-shadow: 0 1px 1px rgba(0, 0, 0, 0.1);
  vertical-align:middle;
}

th:first-child {
  border-top-left-radius:3px;
}

th:last-child {
  border-top-right-radius:3px;
  border-right:none;
}

tr {
  border-top: 1px solid #C1C3D1;
  border-bottom: 1px solid #C1C3D1;
  color:#666B85;
  font-size:16px;
  font-weight:normal;
  text-shadow: 0 1px 1px rgba(256, 256, 256, 0.1);
}

tr:hover td {
  background:#4E5066;
  color:#FFFFFF;
  border-top: 1px solid #22262e;
  border-bottom: 1px solid #22262e;
}

tr:first-child {
  border-top:none;
}

tr:last-child {
  border-bottom:none;
}

tr:nth-child(odd) td {
  background:#EBEBEB;
}

tr:nth-child(odd):hover td {
  background:#4E5066;
}

tr:last-child td:first-child {
  border-bottom-left-radius:3px;
}

tr:last-child td:last-child {
  border-bottom-right-radius:3px;
}

td {
  background:#FFFFFF;
  padding:20px;
  text-align:left;
  vertical-align:middle;
  font-weight:300;
  font-size:18px;
  text-shadow: -1px -1px 1px rgba(0, 0, 0, 0.1);
  border-right: 1px solid #C1C3D1;
}

td:last-child {
  border-right: 0px;
}

th.text-left {
  text-align: left;
}

th.text-center {
  text-align: center;
}

th.text-right {
  text-align: right;
}

td.text-left {
  text-align: left;
}

td.text-center {
  text-align: center;
}

td.text-right {
  text-align: right;
}
"""
    # The with-block closes the file even if the write fails.
    with open('style.css', 'w') as f:
        f.write(css)
359+
188360def doCluster (htmlList ):
189361 siteWordBags = createWordBags (htmlList )
190362 clusterData = createClusters (siteWordBags ,0.6 )
@@ -268,7 +440,7 @@ def doDiff(htmlList,diffList):
268440 htmlRegex = re .compile ('.*html.*' )
269441 for fileName in os .listdir (path ):
270442 if (htmlRegex .match (fileName )):
271- htmlList .append (path + fileName )
443+ htmlList .append (path + fileName )
272444
273445 n = len (htmlList )
274446
@@ -281,7 +453,7 @@ def doDiff(htmlList,diffList):
281453 diffList = []
282454 for fileName in os .listdir (args .diff ):
283455 if (htmlRegex .match (fileName )):
284- diffList .append (args .diff + fileName )
456+ diffList .append (args .diff + fileName )
285457
286458 lists = doDiff (htmlList ,diffList )
287459
@@ -315,4 +487,5 @@ def doDiff(htmlList,diffList):
315487 f = open (args .output ,'w' )
316488 f .write (html )
317489 printJS ()
490+ printCSS ()
318491
0 commit comments