31
54 }
55
56
private void
obradiBlog(Blog blog){
57 String blogUrl = blog.getString(
"url"
)
;
58
if
(crawlPage(blogUrl
,
blog)){
59
while
(LinkToDo.findFirst(
""
) !=
null
){
60 Model currentLink = LinkToDo.findFirst(
""
)
;
61 crawlPage(currentLink.getString(
"url"
)
,
blog)
;
62 currentLink.delete()
;
63 }
64 blog.set(
"processed"
,
true
).saveIt()
;
65 clearLinkCache(blog)
;
66 LinkToDo.deleteAll()
;
67 }
68
else
{
69 blog.set(
"error"
,
true
).saveIt()
;
70 out.println(
"Nije bilo moguce pristupiti blogu: "
+ blogUrl +
71
"
\n
napravit cu pauzu od 10 min i zatim pokusat sa slijedecim blogom..."
)
;
72
try
{
73 Thread.sleep(
600000
)
;
74 }
catch
(InterruptedException e) {
75 e.printStackTrace()
;
76 }
77 }
78 }
79
80
81
private boolean
crawlPage (String pageUrl
,
Blog blog){
82 String blogUrl = blog.getString(
"url"
)
;
83 out.println(
"
\t
Crawling page..."
+ pageUrl +
"
\n\t
on blog..."
+ blogUrl)
;
84
//mali wait kako ne bi slali previse upita na blog.hr
85
try
{
86 Thread.sleep(fairPlayTime)
;
87 }
catch
(InterruptedException e) {
88 e.printStackTrace()
;
89 }
90
91 Document page =
null
;
92
int
retry =
0
;
93
while
(page ==
null
&& retry <= retryLimit){
94
try
{
95 page = getPage(pageUrl)
;
96 }
catch
(IOException e) {
97 retry++
;
98 out.println(
"Could not read page... retrying "
+ retry)
;
99
if
(retry == retryLimit){
100 out.println(
"All retries have failed."
)
;
101 out.println(
"Skipping page..."
)
;
102
return false
;
103 }
104 }
105 }
106 extractLinks(blog
,
page)
;
107 extractComments(blogUrl
,
page)
;
108
return true
;
109 }
110
111
112
private void
extractComments(String blogUrl
,
Document page) {
113 Elements listComment = page.select(
"li[id^=comment]"
)
;
114
for
(Element comment : listComment){
32
115 String fromBlog =
null
;
116
Date date
;
117
118 out.println(
"
\t\t
Comment:"
)
;
119 Element author = comment.select(
"a[href^=http]"
).first()
;
120
if
(author ==
null
){
121 out.println(
"
\t\t\t
No author! Skipping..."
)
;
122
continue
;
123 }
124 fromBlog = author.attr(
"href"
)
;
125
// provjerit ce je li blog s blog.hr i je li je validan url
126
// ovako izbjegava anonimne i ne-blog.hr komentare
127
if
(!fromBlog.contains(
"blog.hr"
)){
128 out.println(
"
\t\t\t
Blog does not originate from blog.hr!"
)
;
129
continue
;
130 }
131
else if
(!validator.isValid(fromBlog)){
132 out.println(
"
\t\t\t
Url is not valid! url:"
+ fromBlog )
;
133
continue
;
134 }
135 out.println(
"
\t\t\t
from blog:"
+ fromBlog)
;
136 out.println(
"
\t\t\t
author: "
+ author.text())
;
137
138 String ispodPosta = comment.select(
"p.ispodposta"
).first().text()
;
139
try
{
140 date =
new
SimpleDateFormat(
"dd.MM.yyyy. (HH:ss)"
,
Locale.ENGLISH).parse(ispodPosta)
;
141 out.println(
"
\t\t\t
date:"
+ date)
;
142 }
catch
(ParseException e) {
143 out.println(
"Could not parse date..."
)
;
144
continue
;
145 }
146 addCommentToDatabase(fromBlog
,
blogUrl
,
date)
;
147 }
148 }
149
150
private void
extractLinks(Blog blog
,
Document page) {
151 String blogUrl = blog.getString(
"url"
)
;
152
int
currentDepth = blog.getInteger(
"depth"
)
;
153 Elements links = page.select(
"a[href]"
)
;
154
for
(Element link : links) {
155 String href = link.attr(
"href"
)
;
156
if
(validator.isValid(href)) {
157
if
(!isLinkInCache(blog
,
href)) {
158
if
(href.contains(blogUrl.replace(
"http://"
,
""
))) {
159 LinkToDo.createIt(
"url"
,
href)
;
160 }
else if
(href.contains(
"blog.hr/komentari"
)) {
161 LinkToDo.createIt(
"url"
,
href)
;
162 }
else if
(href.contains(
"blog.hr/print"
) ||
href.contains(
"www.blog.hr"
) || href.contains(
"mailto:"
)) {
163
//linkOstali.add(link);
164 }
else if
(href.contains(
"blog.hr"
)) {
165
//bilo koji blog.hr url pretvorit u http://imebloga.blog.hr
166 out.println(
"
\t\t
HREF:"
+ href)
;
167 String[] splitHref = href.split(
"//"
)
;
168 String blogSubstring = href
;
169
if
(splitHref[
1
].contains(
"/"
)) {
170
/*ukoliko ima nastavak url npr. nekiblog.blog.hr/nesto/nesto.hr
171
izvuc samo nekiblog.blog.hr */
172 blogSubstring = href.substring(
0
,
href.indexOf(
"/"
,
href.indexOf(
"//"
) +
2
))
;