33
173 }
174 out.println(
"
\t\t
BLOG from href:"
+ blogSubstring)
;
175 addBlogToDatabase(blogSubstring
,
currentDepth +
1
)
;
176 }
177 }
178 }
179 }
180 }
181
182
private
Document getPage (String url)
throws
IOException{
183 out.println(
"Reading url "
+ url)
;
184 Connection soupCon = Jsoup.connect(url)
;
185 soupCon.timeout(timeout)
;
186
return
soupCon.get()
;
187 }
188
189
190
private void
addBlogToDatabase(String url
,
int
depth){
191
if
(Blog.find(
"url = ?"
,
url).isEmpty()){
192 Blog.createIt(
"url"
,
url
,
"processed"
,
false
,
"error"
,
false
,
"depth"
,
depth)
;
193 out.println(
"
\t
Added blog "
+ url +
" to database table blogs."
)
;
194 }
195
else
{
196 out.println(
"
\t
Blog "
+ url +
"is already in the database table
blogs."
)
;
197 }
198
199 }
200
201
private boolean
isLinkInCache(Blog blog
,
String link){
202
if
(LinkCache.find(
"md5 = ?"
,
DigestUtils.md5Hex(link)).isEmpty()){
203 LinkCache.createIt(
"blog_id"
,
blog.get(
"id"
)
,
"md5"
,
DigestUtils.md5Hex(link))
;
204
return false
;
205 }
206
return true
;
207 }
208
209
private void
clearLinkCache(Blog blog) {
210 LinkCache.delete(
"blog_id = ?"
,
blog.get(
"id"
))
;
211 }
212
213
private void
addCommentToDatabase(String fromBlog
,
String toBlog
,
Date date){
214 out.println(
"
\t\t\t
Adding comment to database from: "
+ fromBlog +
" to:"
+ toBlog)
;
215 Model fromBlogFromDb = Blog.first(
"url = ?"
,
fromBlog)
;
216
if
(fromBlogFromDb ==
null
){
217 out.println(
"fromBlog is not valid (no hits in db!)"
)
;
218
return
;
219 }
220 Object fromBlogId = fromBlogFromDb.get(
"id"
)
;
221 Object toBlogId = Blog.first(
"url = ?"
,
toBlog).get(
"id"
)
;
222 Comment.createIt(
"from_blog"
,
fromBlogId
,
"to_blog"
,
toBlogId
,
"date"
,
date)
;
223 }
224 }
ExportToGEXF.java
1
package
info.asy.export
;
34
2
3
import
info.asy.model.Blog
;
4
import
info.asy.model.Comment
;
5
import
it.uniroma1.dis.wiserver.gexf4j.core.Edge
;
6
import
it.uniroma1.dis.wiserver.gexf4j.core.EdgeType
;
7
import
it.uniroma1.dis.wiserver.gexf4j.core.Gexf
;
8
import
it.uniroma1.dis.wiserver.gexf4j.core.Graph
;
9
import
it.uniroma1.dis.wiserver.gexf4j.core.Mode
;
10
import
it.uniroma1.dis.wiserver.gexf4j.core.Node
;
11
import
it.uniroma1.dis.wiserver.gexf4j.core.data.*
;
12
import
it.uniroma1.dis.wiserver.gexf4j.core.dynamic.TimeFormat
;
13
import
it.uniroma1.dis.wiserver.gexf4j.core.impl.GexfImpl
;
14
import
it.uniroma1.dis.wiserver.gexf4j.core.impl.StaxGraphWriter
;
15
import
it.uniroma1.dis.wiserver.gexf4j.core.impl.data.AttributeListImpl
;
16
17
import
java.io.File
;
18
import
java.io.FileWriter
;
19
import
java.io.IOException
;
20
import
java.io.Writer
;
21
import
java.util.ArrayList
;
22
import
java.util.Calendar
;
23
import
java.util.Date
;
24
import
java.util.List
;
25
26
import
org.apache.commons.validator.routines.UrlValidator
;
27
import
org.javalite.activejdbc.Base
;
28
import
org.javalite.activejdbc.LazyList
;
29
import
org.javalite.activejdbc.Model
;
30
import
org.joda.time.DateTime
;
31
32
public class
ExportToGEXF {
33
34
int
nodeCount =
0
;
35
int
edgeCount =
0
;
36
37
public void
export(String fileName){
38 Base.open(
"com.mysql.jdbc.Driver"
,
"jdbc:mysql://localhost/surfer_crawl"
,
"root"
,
"cellar"
)
;
39
40 List allComments = Comment.findBySQL(
"SELECT * FROM comments
ORDER BY date ASC;"
)
;
41 List allBlogs = Blog.find(
"processed = true"
)
;
42
43 Gexf gexf =
new
GexfImpl()
;
44 Calendar date = Calendar.getInstance()
;
45
46 gexf.getMetadata().setLastModified(date.getTime()).setCreator(
"Asy"
)
47 .setDescription(
"Prikaz blogova"
)
;
48
49 Graph graph = gexf.getGraph()
;
50 graph.setDefaultEdgeType(EdgeType.DIRECTED).setMode(Mode.DYNAMIC)
51 .setTimeType(TimeFormat.DATE)
;
52
53 AttributeList attrList =
new
AttributeListImpl(AttributeClass.EDGE)
;
54 attrList.setMode(Mode.DYNAMIC)
;
55 graph.getAttributeLists().add(attrList)
;
56 Attribute attWeight = attrList.createAttribute(
"weight"
,
AttributeType.FLOAT
,
"weight"
)
;
57
58 System.out.println(
"Found "
+ allBlogs.size() +
" blogs..."
)
;
59
int
currentBlog =
0
;