Delete scripts/source_scraper.py
parent f9612e7acc
commit 482d4cd7f1
1 changed file with 0 additions and 294 deletions
@@ -1,294 +0,0 @@
# scripts/source_scraper.py
"""
Automated Channel Discovery for IPTV Playlist Generator
Integrates seamlessly with existing architecture
"""

import requests
import json
import os
import logging
from datetime import datetime
from typing import List, Dict, Set
import re
from urllib.parse import urlparse


class SourceScraper:
    def __init__(self):
        self.setup_logging()
        self.load_config()
        self.discovered_channels = []
        self.source_stats = {}

    def setup_logging(self):
        """Setup logging consistent with existing system"""
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)

    def load_config(self):
        """Load discovery sources configuration"""
        try:
            with open('config/discovery_sources.json', 'r') as f:
                self.config = json.load(f)
        except FileNotFoundError:
            # Create default config if it doesn't exist
            self.config = {
                "enabled": True,
                "sources": [
                    {
                        "name": "IPTV-Org Main",
                        "url": "https://raw.githubusercontent.com/iptv-org/iptv/master/streams/",
                        "type": "github_directory",
                        "country_filter": ["us", "uk", "ca", "au"]
                    },
                    {
                        "name": "Free-TV Collection",
                        "url": "https://raw.githubusercontent.com/Free-TV/IPTV/master/playlist.m3u8",
                        "type": "m3u_playlist",
                        "quality_filter": ["hd", "fhd", "4k"]
                    }
                ],
                "filters": {
                    "min_quality": "sd",
                    "exclude_adult": True,
                    "max_channels_per_source": 100,
                    "require_country_detection": False
                },
                "rate_limiting": {
                    "delay_between_requests": 1.0,
                    "max_retries": 3
                }
            }
            os.makedirs('config', exist_ok=True)
            with open('config/discovery_sources.json', 'w') as f:
                json.dump(self.config, f, indent=2)

    def discover_from_m3u_url(self, source_info: Dict) -> List[str]:
        """Discover channels from M3U playlist URL"""
        try:
            self.logger.info(f"Discovering from M3U: {source_info['name']}")

            response = requests.get(source_info['url'], timeout=30)
            response.raise_for_status()

            content = response.text
            channels = []

            # Parse M3U content
            lines = content.split('\n')
            current_extinf = None

            for line in lines:
                line = line.strip()
                if line.startswith('#EXTINF:'):
                    current_extinf = line
                elif line.startswith('http') and current_extinf:
                    # We have a complete channel entry
                    channels.append(f"{current_extinf}\n{line}")
                    current_extinf = None

            self.logger.info(f"Found {len(channels)} channels from {source_info['name']}")
            self.source_stats[source_info['name']] = len(channels)

            return channels[:self.config['filters']['max_channels_per_source']]

        except Exception as e:
            self.logger.error(f"Error discovering from {source_info['name']}: {e}")
            return []

    def discover_from_github_directory(self, source_info: Dict) -> List[str]:
        """Discover channels from GitHub directory structure"""
        try:
            self.logger.info(f"Discovering from GitHub: {source_info['name']}")

            base_url = source_info['url']
            channels = []

            # Try common country codes from your existing patterns
            country_codes = source_info.get('country_filter', ['us', 'uk', 'ca', 'de', 'fr'])

            for country in country_codes:
                try:
                    url = f"{base_url}{country}.m3u"
                    response = requests.get(url, timeout=15)

                    if response.status_code == 200:
                        content = response.text
                        lines = content.split('\n')
                        current_extinf = None

                        for line in lines:
                            line = line.strip()
                            if line.startswith('#EXTINF:'):
                                current_extinf = line
                            elif line.startswith('http') and current_extinf:
                                channels.append(f"{current_extinf}\n{line}")
                                current_extinf = None

                        self.logger.info(f"Found {len(channels)} channels for {country}")

                except Exception as e:
                    self.logger.debug(f"No channels found for {country}: {e}")
                    continue

            self.source_stats[source_info['name']] = len(channels)
            return channels[:self.config['filters']['max_channels_per_source']]

        except Exception as e:
            self.logger.error(f"Error discovering from GitHub {source_info['name']}: {e}")
            return []

    def filter_channels(self, channels: List[str]) -> List[str]:
        """Apply quality and content filters"""
        if not self.config['filters']['exclude_adult']:
            return channels

        # Load adult keywords from existing config
        try:
            with open('config/patterns.json', 'r') as f:
                patterns = json.load(f)
                adult_keywords = patterns.get('adult_keywords', [])
        except:
            adult_keywords = ['xxx', 'adult', 'porn', '+18']

        filtered = []
        for channel in channels:
            # Check if channel contains adult content
            channel_lower = channel.lower()
            if not any(keyword in channel_lower for keyword in adult_keywords):
                filtered.append(channel)

        self.logger.info(f"Filtered {len(channels) - len(filtered)} adult channels")
        return filtered

    def deduplicate_with_existing(self, new_channels: List[str]) -> List[str]:
        """Remove channels that already exist in channels.txt"""
        if not os.path.exists('channels.txt'):
            return new_channels

        # Read existing channel URLs
        existing_urls = set()
        try:
            with open('channels.txt', 'r', encoding='utf-8') as f:
                content = f.read()
                # Extract URLs from existing channels.txt
                url_pattern = r'Stream URL\s*=\s*(.+)'
                existing_urls = set(re.findall(url_pattern, content))
        except Exception as e:
            self.logger.warning(f"Could not read existing channels: {e}")

        # Filter out duplicates
        unique_channels = []
        for channel in new_channels:
            lines = channel.split('\n')
            if len(lines) >= 2:
                url = lines[1].strip()
                if url not in existing_urls:
                    unique_channels.append(channel)

        self.logger.info(f"Removed {len(new_channels) - len(unique_channels)} duplicate channels")
        return unique_channels

    def append_to_bulk_import(self, channels: List[str]):
        """Append discovered channels to bulk_import.m3u"""
        if not channels:
            self.logger.info("No new channels to add")
            return

        # Read existing bulk_import content
        existing_content = ""
        if os.path.exists('bulk_import.m3u'):
            with open('bulk_import.m3u', 'r', encoding='utf-8') as f:
                existing_content = f.read().strip()

        # If file is empty or only has header, start fresh
        if not existing_content or existing_content == '#EXTM3U':
            existing_content = '#EXTM3U'

        # Append new channels
        with open('bulk_import.m3u', 'w', encoding='utf-8') as f:
            f.write(existing_content)
            if not existing_content.endswith('\n'):
                f.write('\n')
            f.write('\n'.join(channels))
            f.write('\n')

        self.logger.info(f"Added {len(channels)} new channels to bulk_import.m3u")

    def generate_discovery_report(self):
        """Generate discovery session report"""
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        report_path = f"reports/daily/discovery_report_{timestamp}.md"

        os.makedirs('reports/daily', exist_ok=True)

        total_discovered = sum(self.source_stats.values())

        with open(report_path, 'w', encoding='utf-8') as f:
            f.write(f"# Channel Discovery Report\n")
            f.write(f"**Generated:** {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}\n\n")
            f.write(f"## Summary\n")
            f.write(f"- **Total Channels Discovered:** {total_discovered}\n")
            f.write(f"- **Sources Checked:** {len(self.config['sources'])}\n")
            f.write(f"- **Active Sources:** {len(self.source_stats)}\n\n")
            f.write(f"## Source Breakdown\n")

            for source_name, count in self.source_stats.items():
                f.write(f"- **{source_name}:** {count} channels\n")

            f.write(f"\n## Configuration\n")
            f.write(f"- **Max per source:** {self.config['filters']['max_channels_per_source']}\n")
            f.write(f"- **Adult filter:** {'Enabled' if self.config['filters']['exclude_adult'] else 'Disabled'}\n")
            f.write(f"- **Quality filter:** {self.config['filters']['min_quality']}\n")
            f.write(f"\n---\n*Auto-generated by Source Scraper*\n")

        self.logger.info(f"Discovery report saved: {report_path}")

    def run_discovery(self):
        """Main discovery process"""
        if not self.config['enabled']:
            self.logger.info("Discovery is disabled in configuration")
            return

        self.logger.info("=== Starting Channel Discovery ===")

        all_discovered = []

        for source in self.config['sources']:
            try:
                if source['type'] == 'm3u_playlist':
                    channels = self.discover_from_m3u_url(source)
                elif source['type'] == 'github_directory':
                    channels = self.discover_from_github_directory(source)
                else:
                    self.logger.warning(f"Unknown source type: {source['type']}")
                    continue

                all_discovered.extend(channels)

                # Rate limiting
                import time
                time.sleep(self.config['rate_limiting']['delay_between_requests'])

            except Exception as e:
                self.logger.error(f"Error processing source {source['name']}: {e}")
                continue

        # Apply filters
        filtered_channels = self.filter_channels(all_discovered)
        unique_channels = self.deduplicate_with_existing(filtered_channels)

        # Add to bulk import
        self.append_to_bulk_import(unique_channels)

        # Generate report
        self.generate_discovery_report()

        self.logger.info(f"=== Discovery Complete: {len(unique_channels)} new channels added ===")


if __name__ == "__main__":
    scraper = SourceScraper()
    scraper.run_discovery()
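For reference, both discovery paths in the deleted script rely on the same M3U parsing idea: pair each #EXTINF metadata line with the stream URL that follows it. A minimal standalone sketch of that behaviour is shown below; the sample playlist content and the helper name pair_extinf_entries are made up for illustration and are not part of the repository.

# Minimal sketch of the EXTINF/URL pairing used by the removed scraper.
# SAMPLE_M3U is hypothetical data, for illustration only.
SAMPLE_M3U = """#EXTM3U
#EXTINF:-1 tvg-id="ExampleOne.us",Example One
http://example.com/stream1.m3u8
#EXTINF:-1 tvg-id="ExampleTwo.us",Example Two
http://example.com/stream2.m3u8
"""

def pair_extinf_entries(content: str) -> list:
    channels = []
    current_extinf = None
    for line in content.split('\n'):
        line = line.strip()
        if line.startswith('#EXTINF:'):
            current_extinf = line
        elif line.startswith('http') and current_extinf:
            # A complete channel entry: metadata line plus its stream URL
            channels.append(f"{current_extinf}\n{line}")
            current_extinf = None
    return channels

print(pair_extinf_entries(SAMPLE_M3U))  # two paired entries, as appended to bulk_import.m3u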