爬蟲 API 服務規範 (Laravel 10 + JWT):
1. API 規範:
- 身份驗證:
- 使用 tymon/jwt-auth 套件實現 JWT 認證。
- 使用 Laravel Gates 或 Policies 進行授權。
- API 特性:
- RESTful 設計,使用 HTTP 方法 (POST, GET)。
- GET 端點具備冪等性。
- 端點:
- POST /auth/login:使用者登入,返回 JWT。
- POST /crawl/single:爬取單個頁面 (需要 JWT)。
- POST /crawl/website:爬取整個網站 (需要 JWT)。
- GET /crawl/{id}:檢索爬取結果 (需要 JWT)。
- POST /crawl/{id}/suspend:暫停爬取任務 (需要 JWT)。
- 輸入參數:
- url (必填):要爬取的 URL。
- depth (可選):爬取深度 (預設為 1)。
- rules (可選):數據提取規則 (JSON 格式)。
- 輸出格式:
- JSON 響應 (爬取結果或錯誤訊息)。
2. Laravel 10 實作:
- 套件安裝:
composer require tymon/jwt-auth
php artisan jwt:secret
php artisan vendor:publish --provider="Tymon\JWTAuth\Providers\LaravelServiceProvider"
- 路由 (routes/api.php):
PHP
use App\Http\Controllers\AuthController;
use App\Http\Controllers\CrawlController;
use Illuminate\Support\Facades\Route;
// Public endpoint: authenticates credentials and issues a JWT.
Route::post('/auth/login', [AuthController::class, 'login']);
// Every crawl endpoint requires a valid JWT (jwt.auth middleware from tymon/jwt-auth).
Route::middleware('jwt.auth')->group(function () {
// Start a crawl of one page / an entire site; both return the new task id.
Route::post('/crawl/single', [CrawlController::class, 'crawlSingle']);
Route::post('/crawl/website', [CrawlController::class, 'crawlWebsite']);
// Read back the stored result of a task; suspend a pending task.
Route::get('/crawl/{id}', [CrawlController::class, 'getCrawlResult']);
Route::post('/crawl/{id}/suspend', [CrawlController::class, 'suspendCrawl']);
});
- 身份驗證控制器 (AuthController.php):
PHP
namespace App\Http\Controllers;
use Illuminate\Http\Request;
use Illuminate\Support\Facades\Auth;
use Tymon\JWTAuth\Facades\JWTAuth;

class AuthController extends Controller
{
    /**
     * Authenticate a user with email/password and return a JWT.
     *
     * @param  \Illuminate\Http\Request  $request
     * @return \Illuminate\Http\JsonResponse  Token payload, or 401 on bad credentials.
     */
    public function login(Request $request)
    {
        // Validate shape up front (422 on malformed input) instead of running
        // a guaranteed-failing auth attempt on missing fields.
        $credentials = $request->validate([
            'email' => 'required|email',
            'password' => 'required|string',
        ]);

        if (!$token = JWTAuth::attempt($credentials)) {
            return response()->json(['error' => 'Unauthorized'], 401);
        }

        return $this->respondWithToken($token);
    }

    /**
     * Build the standard token response envelope.
     *
     * @param  string  $token
     * @return \Illuminate\Http\JsonResponse
     */
    protected function respondWithToken($token)
    {
        return response()->json([
            'access_token' => $token,
            'token_type' => 'bearer',
            // getTTL() is expressed in minutes; clients expect seconds.
            'expires_in' => JWTAuth::factory()->getTTL() * 60,
        ]);
    }
}
- 爬蟲控制器 (CrawlController.php):PHP
<?php

namespace App\Http\Controllers;

use App\Jobs\CrawlJob;
use App\Models\CrawlTask;
use GuzzleHttp\Client;
use Illuminate\Http\Request;
use Illuminate\Support\Facades\Auth;
use Illuminate\Support\Facades\Queue;
use Illuminate\Support\Facades\Validator;
use Symfony\Component\DomCrawler\Crawler;

class CrawlController extends Controller
{
    /**
     * Crawl a single page: validate input, persist a pending CrawlTask
     * and push a CrawlJob onto the queue.
     *
     * @param  \Illuminate\Http\Request  $request
     * @return \Illuminate\Http\JsonResponse
     */
    public function crawlSingle(Request $request)
    {
        $validator = $this->validateCrawlRequest($request);
        if ($validator->fails()) {
            return response()->json(['errors' => $validator->errors()], 400);
        }

        $crawlTask = $this->createPendingTask($request);

        Queue::push(new CrawlJob($crawlTask));

        return response()->json(['message' => 'Crawl task started', 'id' => $crawlTask->id]);
    }

    /**
     * Crawl an entire website starting from the given URL.
     *
     * @param  \Illuminate\Http\Request  $request
     * @return \Illuminate\Http\JsonResponse
     */
    public function crawlWebsite(Request $request)
    {
        $validator = $this->validateCrawlRequest($request);
        if ($validator->fails()) {
            return response()->json(['errors' => $validator->errors()], 400);
        }

        $crawlTask = $this->createPendingTask($request);

        // NOTE(review): link discovery below runs synchronously inside the
        // HTTP request and recurses per page, so a large or deep site blocks
        // the request until discovery finishes. Consider moving discovery
        // into the queued job itself.
        $this->enqueueWebsiteCrawl($crawlTask);

        return response()->json(['message' => 'Website crawl task started', 'id' => $crawlTask->id]);
    }

    /**
     * Validation rules shared by crawlSingle() and crawlWebsite().
     *
     * @return \Illuminate\Validation\Validator
     */
    private function validateCrawlRequest(Request $request)
    {
        return Validator::make($request->all(), [
            'url' => 'required|url',
            'depth' => 'integer|min:1',
            'rules' => 'nullable|json',
        ]);
    }

    /**
     * Persist a new pending CrawlTask from the validated request input.
     */
    private function createPendingTask(Request $request): CrawlTask
    {
        return CrawlTask::create([
            'url' => $request->url,
            'depth' => $request->depth ?? 1, // default crawl depth when omitted
            'rules' => $request->rules,
            'status' => 'pending',
        ]);
    }

    /**
     * Recursively enqueue crawl jobs for a page and its same-domain links,
     * up to the task's configured depth.
     *
     * @param  \App\Models\CrawlTask  $crawlTask
     * @param  int  $currentDepth
     * @return void
     */
    private function enqueueWebsiteCrawl(CrawlTask $crawlTask, $currentDepth = 0)
    {
        if ($currentDepth > $crawlTask->depth) {
            return;
        }

        Queue::push(new CrawlJob($crawlTask));

        if ($currentDepth >= $crawlTask->depth) {
            return; // deepest level reached; do not discover further links
        }

        try {
            $client = new Client();
            $response = $client->get($crawlTask->url);
            $crawler = new Crawler($response->getBody()->getContents(), $crawlTask->url);

            $links = $crawler->filter('a')->each(function (Crawler $node) {
                return $node->attr('href');
            });

            foreach ($links as $link) {
                // BUG FIX: the original passed a possibly-null href to
                // isSameDomain() before null-checking it; validate first.
                if (!$link) {
                    continue;
                }

                $absoluteLink = $this->makeAbsoluteUrl($link, $crawlTask->url);
                if ($absoluteLink && $this->isSameDomain($absoluteLink, $crawlTask->url)) {
                    $newTask = CrawlTask::create([
                        'url' => $absoluteLink,
                        'depth' => $crawlTask->depth,
                        'rules' => $crawlTask->rules,
                        'status' => 'pending',
                    ]);
                    $this->enqueueWebsiteCrawl($newTask, $currentDepth + 1);
                }
            }
        } catch (\Exception $e) {
            // Best-effort: a failed fetch only stops discovery for this page;
            // the page's own crawl job was already queued above.
        }
    }

    /**
     * Resolve a (possibly relative) link against a base URL.
     *
     * @param  string  $link
     * @param  string  $baseUrl
     * @return string|null  Absolute URL, or null when the link is not crawlable.
     */
    private function makeAbsoluteUrl($link, $baseUrl)
    {
        $base = parse_url($baseUrl);
        if (!isset($base['scheme'], $base['host'])) {
            return null; // base URL is unusable
        }

        if (strpos($link, '//') === 0) {
            return $base['scheme'] . ':' . $link; // protocol-relative
        }
        if (strpos($link, '/') === 0) {
            return $base['scheme'] . '://' . $base['host'] . $link; // root-relative
        }
        if (strpos($link, 'http') === 0) {
            return $link; // already absolute
        }
        // BUG FIX: skip fragments and non-http schemes (mailto:, javascript:,
        // tel:, ...) instead of mangling them into fake relative URLs.
        if (strpos($link, '#') === 0 || strpos($link, ':') !== false) {
            return null;
        }

        // Relative path: resolve against the directory of the base URL.
        // BUG FIX: the original indexed $parsedUrl['path'] without checking it
        // exists (e.g. "https://example.com" has no path component), and had
        // an unreachable trailing `return null;`.
        $dir = rtrim(dirname($base['path'] ?? '/'), '/');

        return $base['scheme'] . '://' . $base['host'] . $dir . '/' . $link;
    }

    /**
     * True when both URLs resolve to the same host.
     *
     * @param  string  $url
     * @param  string  $baseUrl
     * @return bool
     */
    private function isSameDomain($url, $baseUrl)
    {
        $host = parse_url($url, PHP_URL_HOST);
        $baseHost = parse_url($baseUrl, PHP_URL_HOST);

        return is_string($host) && is_string($baseHost) && $host === $baseHost;
    }

    /**
     * Retrieve the stored result of a crawl task.
     *
     * NOTE(review): there is no ownership/authorization check here — any
     * authenticated user can read any task. Confirm this is intended, or add
     * a Policy as the spec suggests.
     *
     * @param  int  $id
     * @return \Illuminate\Http\JsonResponse
     */
    public function getCrawlResult($id)
    {
        $crawlTask = CrawlTask::find($id);
        if (!$crawlTask) {
            return response()->json(['message' => 'Crawl task not found'], 404);
        }

        return response()->json($crawlTask->result);
    }

    /**
     * Mark a crawl task as suspended.
     *
     * @param  int  $id
     * @return \Illuminate\Http\JsonResponse
     */
    public function suspendCrawl($id)
    {
        $crawlTask = CrawlTask::find($id);
        if (!$crawlTask) {
            return response()->json(['message' => 'Crawl task not found'], 404);
        }

        $crawlTask->update(['status' => 'suspended']);

        return response()->json(['message' => 'Crawl task suspended']);
    }
}
程式碼說明:
- crawlWebsite() 方法:
- 驗證請求參數。
- 建立 CrawlTask 任務。
- 調用 enqueueWebsiteCrawl() 方法,將起始任務放入隊列,並遞迴地添加相關任務。
- enqueueWebsiteCrawl() 方法:
- 遞迴地將網站頁面加入隊列。
- 如果當前深度超過設定深度,則停止遞迴。
- 將當前任務放入隊列。
- 如果當前深度小於設定深度,則:
- 使用 Guzzle 發送 HTTP 請求,獲取網頁內容。
- 使用 Symfony DomCrawler 解析 HTML,提取所有連結。
- 過濾連結,只保留相同網域的內部連結。
- 為每個連結建立新的 CrawlTask 任務,並遞迴調用 enqueueWebsiteCrawl() 方法。
- 模型 (Models):
- CrawlTask.php:
PHP
namespace App\Models;
use Illuminate\Database\Eloquent\Factories\HasFactory;
use Illuminate\Database\Eloquent\Model;
use Illuminate\Database\Eloquent\Relations\HasMany;

/**
 * A single crawl request: the URL to fetch, how deep to follow links,
 * optional extraction rules, and its lifecycle status/result.
 */
class CrawlTask extends Model
{
    use HasFactory;

    protected $fillable = ['url', 'depth', 'rules', 'status', 'result'];

    // 'rules' and 'result' are persisted as JSON and exposed as PHP arrays.
    // NOTE(review): because of the 'array' cast, consumers must NOT call
    // json_decode() on these attributes a second time.
    protected $casts = [
        'rules' => 'array',
        'result' => 'array',
    ];

    /**
     * Pages fetched while executing this task.
     */
    public function crawledPages(): HasMany
    {
        return $this->hasMany(CrawledPage::class);
    }
}
* **CrawledPage.php:**
PHP
namespace App\Models;
use Illuminate\Database\Eloquent\Factories\HasFactory;
use Illuminate\Database\Eloquent\Model;
use Illuminate\Database\Eloquent\Relations\BelongsTo;
use Illuminate\Database\Eloquent\Relations\HasMany;

/**
 * One fetched page: raw HTML, title, HTTP status and fetch timestamp,
 * plus its extracted images, videos and metadata rows.
 */
class CrawledPage extends Model
{
    use HasFactory;

    protected $fillable = ['crawl_task_id', 'url', 'title', 'content', 'http_status', 'crawled_at'];

    /** The crawl task that produced this page. */
    public function crawlTask(): BelongsTo
    {
        return $this->belongsTo(CrawlTask::class);
    }

    /** Images downloaded from this page. */
    public function crawledImages(): HasMany
    {
        return $this->hasMany(CrawledImage::class);
    }

    /** Videos downloaded from this page. */
    public function crawledVideos(): HasMany
    {
        return $this->hasMany(CrawledVideo::class);
    }

    /** Key/value metadata extracted from this page (meta tags, rule matches). */
    public function metadata(): HasMany
    {
        return $this->hasMany(Metadata::class);
    }
}
* **CrawledImage.php:**
PHP
namespace App\Models;
use Illuminate\Database\Eloquent\Factories\HasFactory;
use Illuminate\Database\Eloquent\Model;
use Illuminate\Database\Eloquent\Relations\BelongsTo;

/**
 * An image downloaded during a crawl: the source URL, where the file was
 * stored on the public disk, and its size in bytes.
 */
class CrawledImage extends Model
{
    use HasFactory;

    protected $fillable = ['crawled_page_id', 'url', 'file_path', 'file_size'];

    /** The page this image was extracted from. */
    public function crawledPage(): BelongsTo
    {
        return $this->belongsTo(CrawledPage::class);
    }
}
* **CrawledVideo.php:**
PHP
namespace App\Models;
use Illuminate\Database\Eloquent\Factories\HasFactory;
use Illuminate\Database\Eloquent\Model;
use Illuminate\Database\Eloquent\Relations\BelongsTo;

/**
 * A video downloaded during a crawl: the source URL, where the file was
 * stored on the public disk, and its size in bytes.
 */
class CrawledVideo extends Model
{
    use HasFactory;

    protected $fillable = ['crawled_page_id', 'url', 'file_path', 'file_size'];

    /** The page this video was extracted from. */
    public function crawledPage(): BelongsTo
    {
        return $this->belongsTo(CrawledPage::class);
    }
}
* **Metadata.php:**
PHP
namespace App\Models;
use Illuminate\Database\Eloquent\Factories\HasFactory;
use Illuminate\Database\Eloquent\Model;
use Illuminate\Database\Eloquent\Relations\BelongsTo;

/**
 * A key/value pair extracted from a crawled page (meta tags or values
 * matched by user-supplied extraction rules).
 *
 * NOTE(review): Laravel will infer the table name as 'metadata' — confirm
 * the migration uses that exact name.
 */
class Metadata extends Model
{
    use HasFactory;

    protected $fillable = ['crawled_page_id', 'key', 'value'];

    /** The page this metadata row belongs to. */
    public function crawledPage(): BelongsTo
    {
        return $this->belongsTo(CrawledPage::class);
    }
}
隊列任務 (Jobs):CrawlJob.php
<?php

namespace App\Jobs;

use App\Models\CrawledPage;
use App\Models\CrawlTask;
use GuzzleHttp\Client;
use Illuminate\Bus\Queueable;
use Illuminate\Contracts\Queue\ShouldQueue;
use Illuminate\Foundation\Bus\Dispatchable;
use Illuminate\Queue\InteractsWithQueue;
use Illuminate\Queue\SerializesModels;
use Illuminate\Support\Facades\Storage;
use Illuminate\Support\Str;
use Symfony\Component\DomCrawler\Crawler;

class CrawlJob implements ShouldQueue
{
    use Dispatchable, InteractsWithQueue, Queueable, SerializesModels;

    /** @var CrawlTask The task this job executes. */
    protected $crawlTask;

    public function __construct(CrawlTask $crawlTask)
    {
        $this->crawlTask = $crawlTask;
    }

    /**
     * Fetch the task URL, persist the page, extract assets/metadata, and
     * mark the task completed (or failed with the error message).
     */
    public function handle()
    {
        // Honor suspension: POST /crawl/{id}/suspend sets this status, so a
        // task suspended after being queued is skipped instead of crawled.
        $this->crawlTask->refresh();
        if ($this->crawlTask->status === 'suspended') {
            return;
        }

        $client = new Client();
        try {
            $response = $client->get($this->crawlTask->url);
            $statusCode = $response->getStatusCode();
            $html = $response->getBody()->getContents();
            $crawler = new Crawler($html, $this->crawlTask->url);

            // BUG FIX: the original called ->text() unconditionally, which
            // throws when the document has no <title> element.
            $titleNodes = $crawler->filter('title');
            $title = $titleNodes->count() > 0 ? $titleNodes->text() : '';

            $crawledPage = $this->crawlTask->crawledPages()->create([
                'url' => $this->crawlTask->url,
                'title' => $title,
                'content' => $html,
                'http_status' => $statusCode,
                'crawled_at' => now(),
            ]);

            $this->extractAndStoreData($crawler, $crawledPage);

            $this->crawlTask->update(['status' => 'completed']);
        } catch (\Exception $e) {
            // Record the failure on the task so GET /crawl/{id} can expose it.
            $this->crawlTask->update(['status' => 'failed', 'result' => ['error' => $e->getMessage()]]);
        }
    }

    /**
     * Extract images, videos, <meta> tags and user-rule matches from the
     * page and persist them against $crawledPage.
     */
    protected function extractAndStoreData(Crawler $crawler, CrawledPage $crawledPage)
    {
        $crawler->filter('img')->each(function (Crawler $node) use ($crawledPage) {
            $imageUrl = $node->attr('src');
            if ($imageUrl) {
                // NOTE(review): relative src values are fetched as-is and will
                // fail; resolve them against the page URL if that matters.
                $this->downloadAndStoreImage($imageUrl, $crawledPage);
            }
        });

        $crawler->filter('video source')->each(function (Crawler $node) use ($crawledPage) {
            $videoUrl = $node->attr('src');
            if ($videoUrl) {
                $this->downloadAndStoreVideo($videoUrl, $crawledPage);
            }
        });

        $crawler->filter('meta')->each(function (Crawler $node) use ($crawledPage) {
            $name = $node->attr('name');
            $content = $node->attr('content');
            if ($name && $content) {
                $crawledPage->metadata()->create(['key' => $name, 'value' => $content]);
            }
        });

        // BUG FIX: CrawlTask casts 'rules' to array, so it arrives already
        // decoded; the original json_decode()d it again, which is a TypeError
        // on PHP 8. Accept both shapes defensively.
        $rules = $this->crawlTask->rules;
        if (is_string($rules)) {
            $rules = json_decode($rules, true);
        }
        if (is_array($rules)) {
            foreach ($rules as $rule) {
                if (empty($rule['selector']) || empty($rule['name'])) {
                    continue; // skip malformed rules instead of erroring out
                }
                $crawler->filter($rule['selector'])->each(function (Crawler $node) use ($crawledPage, $rule) {
                    $crawledPage->metadata()->create(['key' => $rule['name'], 'value' => $node->text()]);
                });
            }
        }
    }

    /**
     * Download an image and record it in the crawled_images table.
     */
    protected function downloadAndStoreImage($imageUrl, CrawledPage $crawledPage)
    {
        $this->downloadAndStoreAsset($imageUrl, 'image/', 'images', function (array $attrs) use ($crawledPage) {
            $crawledPage->crawledImages()->create($attrs);
        });
    }

    /**
     * Download a video and record it in the crawled_videos table.
     */
    protected function downloadAndStoreVideo($videoUrl, CrawledPage $crawledPage)
    {
        $this->downloadAndStoreAsset($videoUrl, 'video/', 'videos', function (array $attrs) use ($crawledPage) {
            $crawledPage->crawledVideos()->create($attrs);
        });
    }

    /**
     * Shared download pipeline for the two asset types above: fetch $url,
     * verify the Content-Type starts with $typePrefix, store the bytes under
     * storage/app/public/$directory, and hand the row attributes to $persist.
     * Download failures are swallowed (best-effort, matching the original).
     */
    protected function downloadAndStoreAsset($url, $typePrefix, $directory, callable $persist)
    {
        try {
            $client = new Client();
            $response = $client->get($url);
            if (strpos($response->getHeaderLine('Content-Type'), $typePrefix) !== 0) {
                return; // wrong media type; nothing to store
            }

            $content = $response->getBody()->getContents();

            // BUG FIX: take the extension from the URL *path*, so query
            // strings ("photo.jpg?v=2") don't leak into the file name.
            $path = parse_url($url, PHP_URL_PATH);
            $extension = pathinfo(is_string($path) ? $path : '', PATHINFO_EXTENSION) ?: 'bin';

            $filePath = $directory . '/' . Str::random(40) . '.' . $extension;
            Storage::disk('public')->put($filePath, $content);

            $persist([
                'url' => $url,
                'file_path' => $filePath,
                'file_size' => strlen($content),
            ]);
        } catch (\Exception $e) {
            // Best-effort: a failed asset download must not fail the page crawl.
        }
    }
}
程式碼說明:
- downloadAndStoreImage() 方法:
- 使用 Guzzle 下載圖片。
- 檢查 Content-Type 標頭,確保是圖片。
- 使用 Storage::disk('public')->put() 方法將圖片儲存到 storage/app/public/images 目錄。
- 將圖片資訊儲存到 crawled_images 表。
- downloadAndStoreVideo() 方法:
- 與 downloadAndStoreImage() 方法類似,但用於下載和儲存影片。
- 影片儲存到 storage/app/public/videos 目錄。
- 將影片資訊儲存到 crawled_videos 表。
- extractAndStoreData() 方法:
- 調用 downloadAndStoreImage() 和 downloadAndStoreVideo() 方法,下載和儲存圖片和影片。
- 其他部分與之前的程式碼相同。
注意事項:
- 請確保
storage/app/public
目錄具有寫入權限。 - 請根據您的需求,修改圖片和影片的儲存路徑和檔案名稱。
- 如果需要處理其他類型的檔案,可以新增相應的下載和儲存方法。
- 錯誤處理的部分,可以加入紀錄log的程式碼,方便除錯。
沒有留言:
張貼留言