-
Notifications
You must be signed in to change notification settings - Fork 8
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
15 changed files
with
993 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,80 @@ | ||
<?php | ||
namespace Xupopter\Providers; | ||
|
||
use Xupopter\System\Provider; | ||
use Xupopter\System\IProvider; | ||
|
||
/**
 * Provider that crawls fotocasa.es listing pages and converts every ad
 * found there into the array format consumed by sendToDB().
 *
 * getContent(), sendToDB() and strToNumber() are not defined in this class.
 */
class Fotocasa extends Provider implements IProvider
{
    private $domain = "http://www.fotocasa.es";

    /**
     * Fetches one listing page and stores every ad that parses successfully.
     *
     * @param string $path path appended to the provider domain
     */
    public function crawl ($path)
    {
        $q = $this->getContent($this->domain . $path);

        foreach ($q->find('#search-listing tr.expanded') as $data)
        {
            // data-url is handed to getContent() untouched (no domain prefix is added)
            $item = $this->parseItem($this->getContent($data->attr("data-url")));

            // parseItem() returns false for ads that must be skipped
            if ($item) {
                $this->sendToDB($item);
            }
        }
    }

    /**
     * Converts provider output to db's input format
     *
     * @param QueryPath $html parsed detail page of a single ad
     *
     * @return mixed (array/boolean) ad data, or false when the ad has no
     *               surface or no description
     */
    public function parseItem ($html)
    {
        $images = [];

        /*
            transform http://a.ftcs.es/inmesp/anuncio/2015/04/03/135151707/253141017.jpg/w_0/c_690x518/p_1/
            to http://a.ftcs.es/inmesp/anuncio/2015/04/03/135151707/253141017.jpg
        */
        foreach ($html->find('#containerSlider img') as $img)
        {
            // prefer data-src, fall back to the plain src attribute
            $src = $img->attr("data-src");

            if (empty($src)) {
                $src = $img->attr("src");
            }

            // no usable source at all: skip it instead of storing a bare ".jpg"
            if (empty($src)) {
                continue;
            }

            // cut everything after the first ".jpg"; urls without that marker
            // are kept untouched instead of getting a bogus ".jpg" appended
            $cut = strpos($src, ".jpg");
            $images[] = ($cut === false) ? $src : substr($src, 0, $cut) . ".jpg";
        }

        $data = [
            'title' => trim($html->find('.property-title')->text()),
            'description' => trim($html->find('#ctl00_ddDescription .detail-section-content')->text()),
            'images' => $images,
            'location' => trim($html->find('.section.section--noBorder .detail-section-content')->text()),
            'price' => $this->strToNumber($html->find('#priceContainer')->text()),
            'meters' => $this->strToNumber($html->find('#litSurface b')->text()),
            'floor' => (int)$html->find('#litFloor')->text(),
            'url' => $html->find('link[rel="canonical"]')->attr("href")
        ];

        // the only extra currently mapped is the elevator ("Ascensor")
        foreach ($html->find('.detail-extras li') as $li) {
            if (trim($li->text()) === "Ascensor") {
                $data["elevator"] = true;
            }
        }

        // ads without surface or description are discarded
        if ($data["meters"] == 0 || empty($data["description"])) {
            return false;
        }

        return $data;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,127 @@ | ||
<?php | ||
namespace Xupopter\Providers; | ||
|
||
use Xupopter\System\Provider; | ||
use Xupopter\System\IProvider; | ||
|
||
/**
 * Provider that crawls habitaclia.com listing pages and converts every ad
 * found there into the array format consumed by sendToDB().
 *
 * getContent(), sendToDB() and strToNumber() are not defined in this class.
 */
class Habitaclia extends Provider implements IProvider
{
    private $domain = "http://www.habitaclia.com";

    // NOTE(review): not read anywhere in this class; presumably a minimum
    // result count consumed by the crawler runner - confirm against Provider
    public $minResults = 500;

    /**
     * Fetches one listing page and stores every ad that parses successfully.
     *
     * @param string $path path appended to the provider domain
     */
    public function crawl ($path)
    {
        $q = $this->getContent($this->domain . $path);

        foreach ($q->find('#listaAds li a[itemprop=name]') as $data)
        {
            // the href is handed to getContent() untouched (no domain prefix is added)
            $item = $this->parseItem($this->getContent($data->attr("href")));

            // parseItem() returns false for ads that must be skipped
            if ($item) {
                $this->sendToDB($item);
            }
        }
    }

    /**
     * Maps the site's yes/no wording to a boolean.
     *
     * @param string $str already trimmed feature value
     *
     * @return boolean true only for "Sí", false for anything else
     */
    private function stringToBool ($str)
    {
        return $str === "Sí";
    }

    /**
     * Converts provider output to db's input format
     *
     * @param QueryPath $html parsed detail page of a single ad
     *
     * @return mixed (array/boolean) ad data, or false when the page shows the
     *               "request a photo" box or a "price from" label
     */
    public function parseItem ($html)
    {
        $images = [];

        // doesnt have images or price
        if (!empty($html->find('.cajon-pedir-foto')->text()) || !empty($html->find('.pvpdesde')->text())) {
            return false;
        }

        // collapse any run of whitespace / line breaks into a single space
        $location = trim(preg_replace('/(\v|\s)+/', ' ', $html->find('.dir_ex.sprite')->text()));
        $description = trim($html->find('[itemprop="description"] p')->text());

        $data = [
            // trimmed like the title of the sibling providers
            'title' => trim($html->find('.h1ficha')->text()),
            'location' => $location,
            'description' => $description,
            'url' => $html->find('link[rel="canonical"]')->attr("href"),
            "price" => $this->strToNumber($html->find('[itemprop="price"]')->text())
        ];

        // the last update date is the digits-and-slashes run shown between parentheses
        $lastUpdate = trim($html->find('.actualizado.radius')->text());

        if (preg_match("/\(([0-9\/]+)\)/", $lastUpdate, $matches)) {
            $data["lastUpdate"] = $matches[1];
        }

        // summary list: surface and number of rooms
        foreach ($html->find('#inificha .bodis ul li') as $li)
        {
            $text = $li->text();

            if (strpos($text, " m2") !== false) {
                $data["meters"] = $this->strToNumber($li->find("span")->text());
            } else if (strpos($text, "habitaciones") !== false) {
                // the int cast keeps the leading number of the text
                $data["rooms"] = (int)$text;
            }
        }

        // feature list: only "label: value" entries are considered
        foreach ($html->find('.caracteristicas li') as $li)
        {
            $text = $li->text();

            if (strpos($text, ":") === false) {
                continue;
            }

            // split on the first colon only so the value is never cut short
            $info = explode(":", $text, 2);
            $value = trim($info[1]);

            switch (trim($info[0]))
            {
                case "Número de planta":
                    $data["floor"] = (int)$value;
                    break;
                case "Aire acondicionado":
                    $data["airConditioner"] = $this->stringToBool($value);
                    break;
                case "Calefacción":
                    $data["heating"] = $this->stringToBool($value);
                    break;
                case "Parking":
                    $data["parking"] = $this->stringToBool($value);
                    break;
                case "Ascensor":
                    $data["elevator"] = $this->stringToBool($value);
                    break;
                case "Amueblado":
                    $data["furnished"] = $this->stringToBool($value);
                    break;
            }
        }

        foreach ($html->find(".ficha_foto img") as $img)
        {
            $src = $img->attr("src");

            // an <img> without src would otherwise be stored as an empty url
            if (empty($src)) {
                continue;
            }

            // swap the "G.jpg" file suffix for the "XL.jpg" variant
            $images[] = str_replace("G.jpg", "XL.jpg", $src);
        }

        // the images key is only set when at least one photo was found
        if (sizeof($images) > 0) {
            $data["images"] = $images;
        }

        return $data;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
<?php | ||
namespace Xupopter\Providers; | ||
|
||
use Xupopter\System\Provider; | ||
use Xupopter\System\IProvider; | ||
|
||
/**
 * Provider that crawls idealista.com listing pages and converts every ad
 * found there into the array format consumed by sendToDB().
 *
 * getContent(), sendToDB(), strToNumber() and parseHouseInfo() are not
 * defined in this class.
 */
class Idealista extends Provider implements IProvider
{
    private $domain = "http://www.idealista.com";

    /**
     * Fetches one listing page and stores every ad that parses successfully.
     *
     * @param string $path path appended to the provider domain
     */
    public function crawl ($path)
    {
        $q = $this->getContent($this->domain . $path);

        foreach ($q->find('.item-link') as $data)
        {
            // the href is prefixed with the provider domain before fetching
            $item = $this->parseItem($this->getContent($this->domain . $data->attr("href")));

            // parseItem() returns false for ads that must be skipped
            if ($item) {
                $this->sendToDB($item);
            }
        }
    }

    /**
     * Converts provider output to db's input format
     *
     * @param QueryPath $html parsed detail page of a single ad
     *
     * @return mixed (array/boolean) ad data, or false when the og:image (or
     *               its "ch" query value), the surface or the description
     *               is missing
     */
    public function parseItem ($html)
    {
        $images = [];

        // get ch var from og image (required to display the images)
        $ogImage = $html->find('[name="og:image"]')->attr("content");

        if (empty($ogImage)) {
            return false;
        }

        // the og:image url may come without a query string or without "ch";
        // treat that like a missing og:image instead of reading undefined
        // indexes and building image urls with an empty ch value
        $query = [];
        $queryString = parse_url($ogImage, PHP_URL_QUERY);

        if (!empty($queryString)) {
            parse_str($queryString, $query);
        }

        if (empty($query["ch"])) {
            return false;
        }

        $imageCh = $query["ch"];

        /*
            transform http://img3.idealista.com/thumbs,W,H,wi,+tSLyO%2BcnvWFQ1vfQ1%2FQRH6EBc9TEzAKu5PmhgV%2
            to http://img3.idealista.com/thumbs?wi=1500&he=0&en=%2BtSLyO%2BcnvWFQ1vfQ1%2FQRH6EBc9TEzAKu5PmhgV%2&ch=2106166706
        */
        foreach ($html->find('#main-multimedia img') as $img) {
            $service = $img->attr("data-service");

            // without data-service there is no image token to build the url from
            if (empty($service)) {
                continue;
            }

            $image = str_replace("http://img3.idealista.com/thumbs,W,H,wi,+", "", $service);

            // the "+" stripped together with the prefix is re-added url-encoded (%2B)
            $images[] = "http://img3.idealista.com/thumbs?wi=1500&he=0&en=%2B" . urlencode($image) . "&ch=" . $imageCh;
        }

        // the location is the title without the sale / rent prefix
        $title = trim($html->find('h1.txt-bold span')->text());
        $location = str_replace(["Piso en venta en ", "Piso en alquiler en "], "", $title);

        $data = [
            'title' => $title,
            'description' => trim($html->find('.adCommentsLanguage.expandable')->text()),
            'images' => $images,
            'location' => $location,
            'price' => $this->strToNumber($html->find('#main-info .txt-big.txt-bold')->eq(0)->text()),
            'url' => $html->find('#share-link')->attr("href")
        ];

        // parseHouseInfo() receives $data and is relied on to fill keys such
        // as "meters" (checked right below) - presumably by reference
        foreach ($html->find('#fixed-toolbar .info-data > span') as $item)
        {
            $this->parseHouseInfo($item->text(), $data);
        }

        // ads without surface or description are discarded
        if (!isset($data["meters"]) || $data["meters"] == 0 || empty($data["description"])) {
            return false;
        }

        return $data;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,101 @@ | ||
<?php | ||
namespace Xupopter\Providers; | ||
|
||
use Xupopter\System\Provider; | ||
use Xupopter\System\IProvider; | ||
|
||
/**
 * Provider that crawls pisos.com listing pages and converts every ad
 * found there into the array format consumed by sendToDB().
 *
 * getContent(), sendToDB(), strToNumber() and parseHouseInfo() are not
 * defined in this class.
 */
class Pisos extends Provider implements IProvider
{
    private $domain = "http://www.pisos.com";

    // itemprop names copied verbatim (same key, "content" attribute) into the ad data
    private $itemProps = [
        "postalCode",
        "latitude",
        "longitude",
    ];

    /**
     * Fetches one listing page and stores every ad that parses successfully.
     *
     * @param string $path path appended to the provider domain
     */
    public function crawl ($path)
    {
        $q = $this->getContent($this->domain . $path);

        foreach ($q->find('[itemprop="photo"] [itemprop="url"]') as $data)
        {
            // the ad path comes from the "content" attribute and is prefixed with the domain
            $item = $this->parseItem($this->getContent($this->domain . $data->attr("content")));

            // parseItem() returns false for ads that must be skipped
            if ($item) {
                $this->sendToDB($item);
            }
        }
    }

    /**
     * Converts provider output to db's input format
     *
     * @param QueryPath $html parsed detail page of a single ad
     *
     * @return mixed (array/boolean) ad data, or false when the ad has no surface
     */
    public function parseItem ($html)
    {
        $images = [];
        $data = [
            'title' => trim($html->find('h1.title')->text()),
            'description' => trim($html->find('.description')->text()),
            'price' => $this->strToNumber($html->find('.jsPrecioH1')->eq(0)->text()),
            'url' => $html->find('link[rel="canonical"]')->attr("href")
        ];

        foreach ($this->itemProps as $prop)
        {
            $propVal = $html->find('[itemprop="' . $prop . '"]')->attr("content");

            if (!empty($propVal)) {
                $data[$prop] = $propVal;
            }
        }

        // try to get the exact address
        $location = $html->find('[itemprop="streetAddress"]')->attr("content");

        if (empty($location)) {
            // fall back to the ad name without the sale / rent prefix
            $location = $html->find('meta[itemprop="name"]')->attr("content");
            $location = str_replace(["Piso en venta en ", "Piso en alquiler en "], "", (string)$location);
        }

        // join address and position, dropping whichever part is empty so the
        // result never starts or ends with a dangling ", "
        $locationParts = [
            trim((string)$location),
            trim((string)$html->find('h2.position')->text())
        ];
        $data['location'] = implode(", ", array_filter($locationParts, 'strlen'));

        // parseHouseInfo() receives $data and is relied on to fill keys such
        // as "meters" (checked right below) - presumably by reference
        foreach ($html->find('.characteristics .item') as $item)
        {
            $this->parseHouseInfo($item->text(), $data);
        }

        // skip ads that do not state the apartment surface
        if (!isset($data["meters"]) || $data["meters"] < 1) {
            return false;
        }

        /*
            from http://fotos.imghs.net/s/1030/129/1030_27926263129_1_2015112416580031250.jpg
            to http://fotos.imghs.net/xl/1030/129/1030_27926263129_1_2015112416580031250.jpg
        */
        $placeholders = ["nofoto_mini.jpg", "blank1x1.png", "Images/assets"];

        foreach ($html->find("#basic img") as $img)
        {
            $src = $img->attr("src");

            // an <img> without src would otherwise be stored as an empty url
            if (empty($src)) {
                continue;
            }

            $image = str_replace(".net/s/", ".net/xl/", $src);

            // skip the default photos
            $isPlaceholder = false;

            foreach ($placeholders as $marker) {
                if (strpos($image, $marker) !== false) {
                    $isPlaceholder = true;
                    break;
                }
            }

            if ($isPlaceholder) {
                continue;
            }

            $images[] = $image;
        }

        // the images key is only set when at least one real photo was found
        if (sizeof($images) > 0) {
            $data["images"] = $images;
        }

        return $data;
    }
}
Oops, something went wrong.