Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added cache database and User-Agent HTTP header #2

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 13 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
@@ -1,2 +1,14 @@
# 68k-news
Source for the 68k.news site
Source for the 68k.news site

---

I added an SQLite3 cache database in which the articles are stored for 24h. After that period they automatically get deleted.

The database can be disabled by changing the `USE_CACHE` define in `article.php`.

If the database file becomes corrupted, the program will try to delete and recreate it. (This can be turned off by setting `RECREATE_ON_FAIL` to `false` in `cache_database.php`.)

The default cache freshness lifetime is 24h; it can be changed via the `MAX_CACHE_TIME` constant in `cache_database.php`. The same file also defines the cache database filename (`CACHE_DATABASE`, default `cache.db`).

I also had to add a spoofed User-Agent header, because some articles couldn't load without it.
60 changes: 45 additions & 15 deletions article.php
Original file line number Diff line number Diff line change
@@ -1,10 +1,18 @@
<?php
define('USE_CACHE', true);

require_once('vendor/autoload.php');
if(USE_CACHE) require_once('cache_database.php');

// Page state shared by the fetch/parse logic and the HTML template below.
$article_url = "";
// Raw HTML of the article as returned by file_get_contents().
$article_html = "";
// Accumulated error messages (HTML fragments, each ending in <br>).
$error_text = "";
// Simplified article body, either taken from the cache or produced by Readability.
$readable_article = "";
// Image URLs belonging to the article (from the cache or from Readability).
$article_images = array();
// Article title (from the cache or from Readability).
$article_title = "";
// Default edition; replaced by the upper-cased `loc` query parameter when present.
$loc = "US";
// Starts out as USE_CACHE and is switched to false when the cache lookup misses.
$from_cache = USE_CACHE;
// Holds the CacheDatabase instance once caching is attempted; null otherwise.
$database = null;

if( isset( $_GET['loc'] ) ) {
$loc = strtoupper($_GET["loc"]);
Expand Down Expand Up @@ -32,20 +40,42 @@

$readability = new Readability($configuration);

if(!$article_html = file_get_contents($article_url)) {
$error_text .= "Failed to get the article :( <br>";
// Try to serve the article from the cache; fall back to a live fetch on a miss.
if ($from_cache) {
    $database = CacheDatabase::getInstance();

    // No database handle means no lookup is possible.
    $cachedArticle = null;
    if ($database !== null) {
        $cachedArticle = $database->getFromCache($article_url);
    }

    if ($cachedArticle === null) {
        // Nothing cached for this URL: let the fetch/parse step run.
        $from_cache = false;
    } else {
        // Cached entries are stored as [title, content, images].
        $article_title = $cachedArticle[0];
        $readable_article = $cachedArticle[1];
        $article_images = $cachedArticle[2];
    }
}

try {
$readability->parse($article_html);
$readable_article = strip_tags($readability->getContent(), '<ol><ul><li><br><p><small><font><b><strong><i><em><blockquote><h1><h2><h3><h4><h5><h6>');
$readable_article = str_replace( 'strong>', 'b>', $readable_article ); //change <strong> to <b>
$readable_article = str_replace( 'em>', 'i>', $readable_article ); //change <em> to <i>

$readable_article = clean_str($readable_article);

} catch (ParseException $e) {
$error_text .= 'Sorry! ' . $e->getMessage() . '<br>';
// Cache miss (or caching disabled): download the article and extract its readable content.
if (!$from_cache) {
    // Send a browser-like User-Agent; some articles fail to load without one.
    $opts = array(
        'http' => array(
            'method' => "GET",
            'header' => "User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:87.0) Gecko/20100101 Firefox/87.0\r\n"
        )
    );

    $article_html = file_get_contents($article_url, false, stream_context_create($opts));

    if ($article_html === false || $article_html === '') {
        // Nothing was downloaded, so there is nothing to parse or cache.
        $error_text .= "Failed to get the article :( <br>";
    } else {
        try {
            $readability->parse($article_html);

            // Keep only basic markup, then normalise <strong>/<em> to <b>/<i>.
            $readable_article = strip_tags($readability->getContent(), '<ol><ul><li><br><p><small><font><b><strong><i><em><blockquote><h1><h2><h3><h4><h5><h6>');
            $readable_article = str_replace('strong>', 'b>', $readable_article); //change <strong> to <b>
            $readable_article = str_replace('em>', 'i>', $readable_article); //change <em> to <i>
            $readable_article = clean_str($readable_article);

            $article_title = $readability->getTitle();
            $article_images = $readability->getImages();

            // Store the parsed result so later requests can skip the download.
            if (!is_null($database)) {
                $database->writeToCache($article_url, $article_title, $readable_article, $article_images);
            }
        } catch (ParseException $e) {
            $error_text .= 'Sorry! ' . $e->getMessage() . '<br>';
        }
    }
}

//replace chars that old machines probably can't handle
Expand All @@ -64,15 +94,15 @@ function clean_str($str) {

<html>
<head>
<title><?php echo $readability->getTitle();?></title>
<title><?php echo $article_title;?></title>
</head>
<body>
<small><a href="/index.php?loc=<?php echo $loc ?>">< Back to <font color="#9400d3">68k.news</font> <?php echo $loc ?> front page</a></small>
<h1><?php echo clean_str($readability->getTitle());?></h1>
<h1><?php echo clean_str($article_title);?></h1>
<p><small><a href="<?php echo $article_url ?>" target="_blank">Original source</a> (on modern site) <?php
$img_num = 0;
$imgline_html = "| Article images:";
foreach ($readability->getImages() as $image_url):
foreach ($article_images as $image_url):
//we can only do png and jpg
if (strpos($image_url, ".jpg") || strpos($image_url, ".jpeg") || strpos($image_url, ".png") === true) {
$img_num++;
Expand Down
62 changes: 62 additions & 0 deletions cache_database.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
<?php
// Maximum age of a cached article, in seconds (24 hours).
define('MAX_CACHE_TIME', 24 * 60 * 60);
// When true, the database file is deleted and recreated after a write failure.
define('RECREATE_ON_FAIL', true);
// File name of the SQLite cache database.
define('CACHE_DATABASE', 'cache.db');

/**
 * SQLite-backed article cache, exposed as a lazily created singleton.
 *
 * Each row stores one article keyed by its URL together with the time it was
 * cached; rows older than MAX_CACHE_TIME seconds are treated as expired.
 */
class CacheDatabase extends SQLite3{
    /** @var CacheDatabase|null Shared instance, or null when not (successfully) opened. */
    private static $instance = null;

    /**
     * Opens the database file and makes sure the cache table exists.
     *
     * @param string $path Path of the SQLite database file.
     * @throws Exception When the database cannot be opened.
     */
    private function __construct($path){
        parent::__construct($path);
        $this->exec("CREATE TABLE IF NOT EXISTS cache(id INTEGER PRIMARY KEY AUTOINCREMENT, url TEXT UNIQUE, epoch INTEGER, title TEXT, content BLOB, images TEXT)");
    }

    /**
     * Creates a new instance, reporting (not throwing) open failures.
     *
     * @return CacheDatabase|null The opened database, or null on failure.
     */
    private static function open(){
        try{
            return new CacheDatabase(CACHE_DATABASE);
        }catch(Exception $ex){
            trigger_error("Error opening database: " . $ex->getMessage());
            return null;
        }
    }

    /**
     * Looks up a non-expired cache entry.
     *
     * @param string $url Article URL used as the cache key.
     * @return array|null [0 => title, 1 => content, 2 => image list], or null
     *                    when there is no fresh entry or the query fails.
     */
    public function getFromCache($url){
        // Only rows that writeToCache() would not purge count as hits.
        $statement = $this->prepare("SELECT content, title, images FROM cache WHERE url = ? AND epoch >= ?");
        if(!$statement) return null;

        $statement->bindValue(1, $url);
        $statement->bindValue(2, time() - MAX_CACHE_TIME, SQLITE3_INTEGER);

        $result = $statement->execute();
        if(!$result) return null;

        $row = $result->fetchArray(SQLITE3_ASSOC);
        if(!$row) return null;

        // The image list is a plain array of strings; never instantiate objects from stored data.
        $images = unserialize($row['images'], array('allowed_classes' => false));
        if(!is_array($images)) $images = array();

        return array(
            0 => $row['title'],
            1 => $row['content'],
            2 => $images
        );
    }

    /**
     * Purges expired entries and stores (or replaces) the entry for $url.
     *
     * If the INSERT statement cannot be prepared and RECREATE_ON_FAIL is true,
     * the database file is deleted and a fresh instance is opened.
     *
     * @param string $url     Article URL used as the cache key.
     * @param string $title   Article title.
     * @param string $content Simplified article body.
     * @param array  $images  Image URLs belonging to the article.
     * @return void
     */
    public function writeToCache($url, $title, $content, $images){
        $now = time();

        if($cleanStatement = $this->prepare("DELETE FROM cache WHERE epoch < ?")){
            $cleanStatement->bindValue(1, $now - MAX_CACHE_TIME, SQLITE3_INTEGER);
            $cleanStatement->execute();
        }

        // OR REPLACE: the url column is UNIQUE, so an existing row must be overwritten, not rejected.
        $statement = $this->prepare("INSERT OR REPLACE INTO cache (url, epoch, title, content, images) VALUES (?, ?, ?, ?, ?)");
        if($statement){
            $statement->bindValue(1, $url);
            $statement->bindValue(2, $now, SQLITE3_INTEGER);
            $statement->bindValue(3, $title);
            $statement->bindValue(4, $content);
            $statement->bindValue(5, serialize($images));
            if(!$statement->execute()){
                trigger_error("Cache database writing error: " . $this->lastErrorMsg());
            }
            return;
        }

        trigger_error("Cache database writing error");
        if(RECREATE_ON_FAIL){
            trigger_error("Deleting database");
            $this->close();
            if(file_exists(CACHE_DATABASE)) unlink(CACHE_DATABASE);
            self::$instance = self::open();
        }
    }

    /**
     * Returns the shared instance, opening the database on first use.
     *
     * @return CacheDatabase|null The shared instance, or null when the database cannot be opened.
     */
    public static function getInstance(){
        if(is_null(self::$instance)){
            self::$instance = self::open();
        }
        return self::$instance;
    }
}
?>