Example - wikipedia images downloader

The HTML output lists the first 10 available images retrieved from en.wikipedia.org. The output is refreshed every 15 minutes by cron.

 

Outputs:

Source URL;Image URL;Name
"http://en.wikipedia.org/wiki/Main_Page";"https://upload.wikimedia.org/wikipedia/commons/thumb/2/2d/Bayview_Park_Canadian_exiles_memorial_%281%29.jpg/120px-Bayview_Park_Canadian_exiles_memorial_%281%29.jpg";"120px-Bayview_Park_Canadian_exiles_memorial_%281%29.jpg"
"http://en.wikipedia.org/wiki/Main_Page";"https://upload.wikimedia.org/wikipedia/commons/thumb/a/ae/Eso1737a.jpg/150px-Eso1737a.jpg";"150px-Eso1737a.jpg"
"http://en.wikipedia.org/wiki/Main_Page";"https://upload.wikimedia.org/wikipedia/commons/thumb/c/cd/Der_Durchbruch_der_Schutztruppe_Deutsch-Ostafrika_%C3%BCber_den_Rowuma_MItte_November_1917._Darstellung_von_Carl_Arriens.jpg/120px-Der_Durchbruch_der_Schutztruppe_Deutsch-Ostafrika_%C3%BCber_den_Rowuma_MItte_November_1917._Darstellung_von_Carl_Arriens.jpg";"120px-Der_Durchbruch_der_Schutztruppe_Deutsch-Ostafrika_%C3%BCber_den_Rowuma_MItte_November_1917._Darstellung_von_Carl_Arriens.jpg"
"http://en.wikipedia.org/wiki/Main_Page";"https://upload.wikimedia.org/wikipedia/commons/thumb/8/87/Amanhecer_no_Hercules_--.jpg/380px-Amanhecer_no_Hercules_--.jpg";"380px-Amanhecer_no_Hercules_--.jpg"
"http://en.wikipedia.org/wiki/Main_Page";"https://upload.wikimedia.org/wikipedia/en/thumb/4/4a/Commons-logo.svg/31px-Commons-logo.svg.png";"31px-Commons-logo.svg.png"
"http://en.wikipedia.org/wiki/Main_Page";"https://upload.wikimedia.org/wikipedia/commons/thumb/3/3d/Mediawiki-logo.png/35px-Mediawiki-logo.png";"35px-Mediawiki-logo.png"
"http://en.wikipedia.org/wiki/Main_Page";"https://upload.wikimedia.org/wikipedia/commons/thumb/7/75/Wikimedia_Community_Logo.svg/35px-Wikimedia_Community_Logo.svg.png";"35px-Wikimedia_Community_Logo.svg.png"
"http://en.wikipedia.org/wiki/Main_Page";"https://upload.wikimedia.org/wikipedia/commons/thumb/f/fa/Wikibooks-logo.svg/35px-Wikibooks-logo.svg.png";"35px-Wikibooks-logo.svg.png"
"http://en.wikipedia.org/wiki/Main_Page";"https://upload.wikimedia.org/wikipedia/commons/thumb/f/ff/Wikidata-logo.svg/47px-Wikidata-logo.svg.png";"47px-Wikidata-logo.svg.png"
"http://en.wikipedia.org/wiki/Main_Page";"https://upload.wikimedia.org/wikipedia/commons/thumb/2/24/Wikinews-logo.svg/51px-Wikinews-logo.svg.png";"51px-Wikinews-logo.svg.png"

Source code of script:

# File: imgdownloader_main.w
# Name: Image Downloader
# Description: The script opens the defined URL, finds the first 10 available images, downloads them into the folder 'images' and saves basic information into a CSV file.
# Input: URL
# Output format: CSV file, XLS file, images
# Output fields: Source URL, Image URL, Name

#<Logger File>
#	Global
#	FileName imgdownloader_log.log
#	Level debug
#</Logger>

<Section>
	Name imgdownloader_main

	# Output files: the CSV is written row by row inside the loop below,
	# the XLS is generated from the finished CSV at the end of the section.
	Define $output_file_csv imgdownloader_output.csv
	Define $output_file_xls imgdownloader_output.xls

	# Page whose <img> tags are scanned.
	Define $url http://en.wikipedia.org/wiki/Main_Page



	# Remove CSV output left over from a previous run.
	# NOTE(review): the glob removes every *.csv in the working directory,
	# not only {$output_file_csv} - narrow it if other CSV files may live here.
	<Action Exec>
		cmd rm -f *.csv
	</Action>



	# Download the content of the URL; RemoveNewLine is requested so the
	# <img> patterns below can be matched against the page as a single line.
	<Action ContentURL>
		URL {$url} 
		RemoveNewLine
	</Action>

	# Process at most 10 images (one per iteration).
	<Section While>
		MaxIterations 10

		# Two alternative patterns for the src attribute:
		# quoted (src="...") and unquoted (src=... terminated by a space).
		# Either one stores the captured address in $url_img.
		<Section Or>
			NoContext

			<Pattern>
				RegExp <img{:re(.*?)}src="{$url_img:re([^"]*)}"
				Trim
				Compact
				MultiLine
			</Pattern>

			<Pattern>
				RegExp <img{:re(.*?)}src={$url_img:re([^ ]*)} 
				Trim
				Compact
				MultiLine
			</Pattern>
		</Section>

		# Relative address -> absolute address.
		# Any address that does not start with "http" (case-insensitive) gets
		# "https:" prepended, which is correct for protocol-relative addresses
		# such as //upload.wikimedia.org/...
		# preg_match() replaces eregi(), which was removed in PHP 7.0.
		# NOTE(review): path-relative addresses (/static/..., images/...) are
		# not resolved against $url by this step.
		<Action Php>
			Code if (!preg_match('/^http/i', $context->getVariable('$url_img'))) $context->setVariable('$url_img', 'https:'.$context->getVariable('$url_img'));
		</Action>

		# Image name = last path component of the image URL.
		# PATHINFO_BASENAME equals filename.".".extension whenever an extension
		# is present, and avoids a trailing "." when the URL has none.
		<Action Php>
			Code $context->setVariable('$name_img', pathinfo($context->getVariable('$url_img'), PATHINFO_BASENAME));
		</Action>

		# Download the image into the 'images' folder.
		<Action URLToFile>
			URL {$url_img}
			FileName images/{$name_img}
		</Action>

		# Save basic information about the image into the CSV file
		# (columns: Source URL, Image URL, Name; ';' as separator).
		<Action SaveCSV>
			FileName {$output_file_csv}
			Separator ;
			Column $url, Source URL
			Column $url_img, Image URL
			Column $name_img, Name
		</Action>
	</Section>



	# Remove XLS output left over from a previous run.
	# NOTE(review): the glob removes every *.xls in the working directory,
	# not only {$output_file_xls}.
	<Action Exec>
		cmd rm -f *.xls
	</Action>

	# Convert the data from the CSV file into an XLS file
	# (same ';' separator as used by SaveCSV above).
	<Action ConvertToXLS>
		InputFile {$output_file_csv}
		OutputFile {$output_file_xls}
		Separator ;
	</Action>
</Section>

Main imgdownloader_main