TitleCrawler : Use scala-scraper.

This commit is contained in:
Dmitry Voronin 2024-09-23 18:15:50 +03:00
parent add30216b5
commit 57c4031377
Signed by: voronind
SSH key fingerprint: SHA256:3kBb4iV2ahufEBNq+vFbUe4QYfHt98DHQjN7QaptY9k
2 changed files with 25 additions and 12 deletions

View file

@ -1,11 +1,24 @@
package com.voronind.doublegis.test
package model.crawler
import model.crawler.TitleCrawler.TITLE_UNKNOWN
import net.ruippeixotog.scalascraper.browser.JsoupBrowser
/**
 * A crawler that extracts the title of an HTML page (the document's `<title>`).
 */
class TitleCrawler extends Crawler {
  import scala.util.control.NonFatal

  /**
   * Loads the page at `url` with a [[JsoupBrowser]] and returns its document title.
   *
   * @param url address of the page to load.
   * @return the document title, or `TITLE_UNKNOWN` when loading the page fails
   *         with a non-fatal error.
   */
  override def crawl(url: String): String = {
    try {
      val browser = JsoupBrowser()
      val html = browser.get(url)
      html.title
    } catch {
      // Best effort: any ordinary failure (network, parsing, bad URL) yields the
      // fallback text. NonFatal lets interrupts and VM errors propagate instead
      // of being silently swallowed.
      case NonFatal(_) => TITLE_UNKNOWN
    }
  }
}
/**
 * Companion object holding constants for the `TitleCrawler` class.
 */
object TitleCrawler {
  /** Fallback text for a title that could not be extracted. */
  private val TITLE_UNKNOWN: String = "Unable to extract the title."
}

View file

@ -1,12 +1,12 @@
package com.voronind.doublegis.test
package model.handler
import model.crawler.TitleCrawler
import model.lib.{HttpUtil, StreamUtil}
import com.sun.net.httpserver.{HttpExchange, HttpHandler}
import com.voronind.doublegis.test.model.crawler.TitleCrawler
import java.io.{BufferedReader, ByteArrayInputStream, InputStream, InputStreamReader}
import java.io.{BufferedReader, ByteArrayInputStream, InputStreamReader}
import scala.language.postfixOps
/**
@ -28,20 +28,20 @@ class TitleCrawlerHandler extends HttpHandler, Handler {
// I don't know if this one is ugly, but I wanted to show off a bit.
/**
 * Builds the crawl report for a request body and sends it as the HTTP response.
 *
 * The request body is read line by line; every line accepted by `HttpUtil.isUrl`
 * is turned into a `url => result` entry using `runCrawler`, and the entries are
 * joined with newlines. The report is sent with status 200.
 *
 * @param request raw bytes of the request body, decoded here as UTF-8 text.
 */
extension (exchange: HttpExchange) private def sendResponse(request: Array[Byte]): Unit = {
  import java.nio.charset.StandardCharsets

  // Decode with an explicit charset so the result does not depend on the platform default.
  val reader = new BufferedReader(
    new InputStreamReader(new ByteArrayInputStream(request), StandardCharsets.UTF_8)
  )

  val response: Array[Byte] = Iterator
    .continually(reader.readLine)
    .takeWhile(line => line != null) // readLine returns null at end of input
    .filter(HttpUtil.isUrl)
    .map(url => s"$url => ${runCrawler(url)}")
    .mkString("\n")
    .getBytes(StandardCharsets.UTF_8)

  // The announced length is the encoded byte count, not the character count,
  // so it stays correct when titles contain non-ASCII characters.
  exchange.sendResponseHeaders(200, response.length)

  val output = exchange.getResponseBody
  try {
    StreamUtil.copyStream(new ByteArrayInputStream(response), output)
  } finally {
    // Always close the response body, even if the copy fails.
    // NOTE(review): assumes StreamUtil.copyStream does not need the stream open afterwards — confirm.
    output.close()
  }
}
private def runCrawler(url: String): String = {