TitleCrawler: Use scala-scraper.

This commit is contained in:
Dmitry Voronin 2024-09-23 18:15:50 +03:00
parent add30216b5
commit 57c4031377
Signed by: voronind
SSH key fingerprint: SHA256:3kBb4iV2ahufEBNq+vFbUe4QYfHt98DHQjN7QaptY9k
2 changed files with 25 additions and 12 deletions

View file

@@ -1,11 +1,24 @@
package com.voronind.doublegis.test package com.voronind.doublegis.test
package model.crawler package model.crawler
import model.crawler.TitleCrawler.TITLE_UNKNOWN
import net.ruippeixotog.scalascraper.browser.JsoupBrowser
/** /**
* A crawler that extracts Http Head Title. * A crawler that extracts Http Head Title.
*/ */
class TitleCrawler extends Crawler { class TitleCrawler extends Crawler {
override def crawl(url: String): String = { override def crawl(url: String): String = {
"Mock title." try {
val browser = JsoupBrowser()
val html = browser.get(url)
html.title
} catch
case e: Exception => TITLE_UNKNOWN
} }
} }
object TitleCrawler {
private val TITLE_UNKNOWN = "Unable to extract the title."
}

View file

@@ -1,12 +1,12 @@
package com.voronind.doublegis.test package com.voronind.doublegis.test
package model.handler package model.handler
import model.crawler.TitleCrawler
import model.lib.{HttpUtil, StreamUtil} import model.lib.{HttpUtil, StreamUtil}
import com.sun.net.httpserver.{HttpExchange, HttpHandler} import com.sun.net.httpserver.{HttpExchange, HttpHandler}
import com.voronind.doublegis.test.model.crawler.TitleCrawler
import java.io.{BufferedReader, ByteArrayInputStream, InputStream, InputStreamReader} import java.io.{BufferedReader, ByteArrayInputStream, InputStreamReader}
import scala.language.postfixOps import scala.language.postfixOps
/** /**
@@ -28,20 +28,20 @@ class TitleCrawlerHandler extends HttpHandler, Handler {
// I don't know if this one is ugly, but I wanted to show off a bit. // I don't know if this one is ugly, but I wanted to show off a bit.
extension (exchange: HttpExchange) private def sendResponse(request: Array[Byte]): Unit = { extension (exchange: HttpExchange) private def sendResponse(request: Array[Byte]): Unit = {
val reader = new BufferedReader(new InputStreamReader(ByteArrayInputStream(request))) val reader = new BufferedReader(new InputStreamReader(ByteArrayInputStream(request)))
val response = Iterator
val result = Iterator
.continually(reader.readLine) .continually(reader.readLine)
.takeWhile(null !=) .takeWhile(null !=)
.filter(HttpUtil.isUrl) .filter(HttpUtil.isUrl)
.map({ url => f"$url => ${runCrawler(url)}" }) .map({ url => s"$url => ${runCrawler(url)}" })
.toList .mkString("\n")
.getBytes()
val response = result.mkString("\n") exchange.sendResponseHeaders(200, response.length)
exchange.sendResponseHeaders(200, response.length())
val out = exchange.getResponseBody val output = exchange.getResponseBody
out.write(response.getBytes)
out.close() StreamUtil.copyStream(new ByteArrayInputStream(response), output)
output.close()
} }
private def runCrawler(url: String): String = { private def runCrawler(url: String): String = {