TitleCrawler : Use scala-scraper.
This commit is contained in:
parent
add30216b5
commit
57c4031377
|
@ -1,11 +1,24 @@
|
||||||
package com.voronind.doublegis.test
|
package com.voronind.doublegis.test
|
||||||
package model.crawler
|
package model.crawler
|
||||||
|
|
||||||
|
import model.crawler.TitleCrawler.TITLE_UNKNOWN
|
||||||
|
|
||||||
|
import net.ruippeixotog.scalascraper.browser.JsoupBrowser
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* A crawler that extracts Http Head Title.
|
* A crawler that extracts Http Head Title.
|
||||||
*/
|
*/
|
||||||
class TitleCrawler extends Crawler {
|
class TitleCrawler extends Crawler {
|
||||||
override def crawl(url: String): String = {
|
override def crawl(url: String): String = {
|
||||||
"Mock title."
|
try {
|
||||||
|
val browser = JsoupBrowser()
|
||||||
|
val html = browser.get(url)
|
||||||
|
html.title
|
||||||
|
} catch
|
||||||
|
case e: Exception => TITLE_UNKNOWN
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
object TitleCrawler {
|
||||||
|
private val TITLE_UNKNOWN = "Unable to extract the title."
|
||||||
|
}
|
||||||
|
|
|
@ -1,12 +1,12 @@
|
||||||
package com.voronind.doublegis.test
|
package com.voronind.doublegis.test
|
||||||
package model.handler
|
package model.handler
|
||||||
|
|
||||||
|
import model.crawler.TitleCrawler
|
||||||
import model.lib.{HttpUtil, StreamUtil}
|
import model.lib.{HttpUtil, StreamUtil}
|
||||||
|
|
||||||
import com.sun.net.httpserver.{HttpExchange, HttpHandler}
|
import com.sun.net.httpserver.{HttpExchange, HttpHandler}
|
||||||
import com.voronind.doublegis.test.model.crawler.TitleCrawler
|
|
||||||
|
|
||||||
import java.io.{BufferedReader, ByteArrayInputStream, InputStream, InputStreamReader}
|
import java.io.{BufferedReader, ByteArrayInputStream, InputStreamReader}
|
||||||
import scala.language.postfixOps
|
import scala.language.postfixOps
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -28,20 +28,20 @@ class TitleCrawlerHandler extends HttpHandler, Handler {
|
||||||
// I don't know if this one is ugly, but I wanted to show off a bit.
|
// I don't know if this one is ugly, but I wanted to show off a bit.
|
||||||
extension (exchange: HttpExchange) private def sendResponse(request: Array[Byte]): Unit = {
|
extension (exchange: HttpExchange) private def sendResponse(request: Array[Byte]): Unit = {
|
||||||
val reader = new BufferedReader(new InputStreamReader(ByteArrayInputStream(request)))
|
val reader = new BufferedReader(new InputStreamReader(ByteArrayInputStream(request)))
|
||||||
|
val response = Iterator
|
||||||
val result = Iterator
|
|
||||||
.continually(reader.readLine)
|
.continually(reader.readLine)
|
||||||
.takeWhile(null !=)
|
.takeWhile(null !=)
|
||||||
.filter(HttpUtil.isUrl)
|
.filter(HttpUtil.isUrl)
|
||||||
.map({ url => f"$url => ${runCrawler(url)}" })
|
.map({ url => s"$url => ${runCrawler(url)}" })
|
||||||
.toList
|
.mkString("\n")
|
||||||
|
.getBytes()
|
||||||
|
|
||||||
val response = result.mkString("\n")
|
exchange.sendResponseHeaders(200, response.length)
|
||||||
exchange.sendResponseHeaders(200, response.length())
|
|
||||||
|
|
||||||
val out = exchange.getResponseBody
|
val output = exchange.getResponseBody
|
||||||
out.write(response.getBytes)
|
|
||||||
out.close()
|
StreamUtil.copyStream(new ByteArrayInputStream(response), output)
|
||||||
|
output.close()
|
||||||
}
|
}
|
||||||
|
|
||||||
private def runCrawler(url: String): String = {
|
private def runCrawler(url: String): String = {
|
||||||
|
|
Reference in a new issue