#!/usr/bin/env perl
use strict;
use warnings;
use autodie qw(:all);

# These work
# curl https://event.example.com | validate-schema --dynamic
# validate-schema --dynamic --file https://event.example.com

use DateTime::Format::ISO8601;
use Mojo::DOM;
use JSON::MaybeXS qw(decode_json encode_json);
use Encode qw(decode);
use File::Spec;
use Getopt::Long;
use LWP::UserAgent;
use Locale::Country;
use Scalar::Util 'looks_like_number';
use URI;

# Command-line options
my $file;		# --file: path or http(s) URL of the HTML to check; STDIN is read when omitted
my $github_mode = 0;	# --github: collect findings into a SARIF report instead of printing them
my $dynamic_mode = 0;	# --dynamic: additionally check against the downloaded Schema.org vocabulary

# The usage text ends with a newline so that die() prints it verbatim
# instead of appending "at <script> line N." to the last sentence.
GetOptions(
	'file=s' => \$file,
	'github' => \$github_mode,
	'dynamic' => \$dynamic_mode,
) or die "Usage: $0 [--file input.html] [--github] [--dynamic].\nIf file is not given, use STDIN.\nDoes the right thing if file starts with http:// or https://\n";

# die "Usage: $0 --file input.html [--github] [--dynamic]" unless $file;

# Directory that holds cached downloads: $CACHE_DIR when set, otherwise ~/.cache.
my $cache_dir = $ENV{'CACHE_DIR'};

if(!defined($cache_dir)) {
	# File::HomeDir is only needed for the fallback, so it is loaded on demand.
	require File::HomeDir;
	File::HomeDir->import();

	# $cache_dir = File::Spec->catfile(File::Spec->tmpdir(), 'cache', 'http-cache-transparent');
	$cache_dir = File::Spec->catfile(File::HomeDir->my_home(), '.cache');
}

if(!-d $cache_dir) {
	# Parenthesised call plus low-precedence 'or': with "mkdir $dir, 02755 || die"
	# the || binds to the (always true) mode, so the check could never fire.
	# mkdir reports its failure in $!, not $@.
	mkdir($cache_dir, 02755) or die "$cache_dir: $!";
}
my $exit_code = 0;	# Process exit status; push_validation() overwrites it on every finding

# Findings collected for the SARIF report (filled by push_validation() in --github mode).
my @sarif_results;

# Vocabulary tables filled by load_dynamic_vocabulary() when --dynamic is given.
my %dynamic_schema;	# Class definitions
my %dynamic_properties;	# Property definitions

#-----------------------------------------------------------------
# Built-in rule table, keyed by Schema.org type name.
# A type entry may carry:
#  - required: names of properties that must be present.
#  - nested: property name => type name used to validate the nested object.
#  - property_validations: property name => code ref returning true for an acceptable value.
#  - enum: property name => list of the only values accepted.
my %schema = (
	Person => {
		required => ['name'],
	},
	PerformingGroup => {
		required => ['name'],
	},
	PostalAddress => {
		enum => {
			# Sample country strings, for demonstration only.
			addressCountry => ['United_States', 'US', 'Canada', 'United_Kingdom'],
		},
		required => ['addressCountry', 'addressLocality'],
	},
	MusicEvent => {
		property_validations => {
			# NOTE(review): this key is lower-case while 'required' spells it
			# 'startDate' - confirm the lookup in validate_entity() matches both.
			startdate => \&is_valid_datetime,
		},
		nested => {
			location => 'PostalAddress',
		},
		required => ['name', 'startDate', 'location'],
	},
);

#-----------------------------------------------------------------
# Load the HTML to validate into $html.
#  - no --file: everything from STDIN (or files left in @ARGV), used as read;
#  - http(s) URL: fetched with LWP::UserAgent, body bytes decoded from Windows-1252;
#  - anything else: local file read in raw mode and decoded from Windows-1252.
my $html;
if(!defined($file)) {
	# List-context <> gathers every line; join() yields '' rather than undef
	# when there is no input at all, so the parser never sees an undefined value.
	$html = join('', <>);
} elsif($file =~ m{^https?://}) {
	# It's a URL, fetch it using LWP::UserAgent
	my $ua = LWP::UserAgent->new(timeout => 15);
	my $response = $ua->get($file);

	if(!$response->is_success()) {
		die "Failed to fetch URL '$file': ", $response->status_line();
	}
	# charset => 'none' undoes any Content-Encoding but leaves the body as bytes.
	# Plain decoded_content() already returns characters, and decoding those a
	# second time croaks as soon as the page holds a character above 0xFF.
	my $raw_content = $response->decoded_content(charset => 'none');
	die "Failed to decode the response body of '$file'" unless defined $raw_content;
	$html = decode('Windows-1252', $raw_content);
} else {
	# Local file
	open my $fh, '<:raw', $file or die "Cannot open file '$file': $!";
	my $raw_content = do { local $/; <$fh> };	# slurp the whole file
	close $fh;
	$html = decode('Windows-1252', $raw_content // '');
}

#-----------------------------------------------------------------
# Find every JSON-LD <script> element; Mojo::DOM tolerates malformed markup.
my $dom = Mojo::DOM->new($html);
my $scripts = $dom->find('script[type="application/ld+json"]');
my @ld_blocks = $scripts->each();

# The vocabulary is only loaded when there is at least one block to check.
if($dynamic_mode && @ld_blocks) {
	%dynamic_schema = load_dynamic_vocabulary();
	if(!%dynamic_schema) {
		warn 'Dynamic vocabulary could not be loaded';
	}
}

for my $ld_block (@ld_blocks) {
	# Trim surrounding whitespace and ignore empty script elements.
	(my $json_text = $ld_block->all_text) =~ s/^\s+|\s+$//g;
	next if !length $json_text;

	my $data = eval { decode_json($json_text) };
	if($@ || !$data) {
		warn "Invalid JSON: $@";
		next;
	}

	print "Found Schema.org block:\n" if !$github_mode;

	# A block is either a single entity or an array of entities.
	my @entities = (ref $data eq 'ARRAY') ? @{$data} : ($data);
	for my $entity (@entities) {
		validate_entity($entity, 0, '');
	}
}

#-----------------------------------------------------------------
# If in GitHub mode, write the collected findings to schema_validation.sarif,
# then exit with the status recorded by push_validation().
if ($github_mode) {
	# One entry per rule ID this script hands to push_validation(), including
	# the SCHEMA_DYN* IDs used by the dynamic checks, so that every result's
	# ruleId can be resolved against this table.
	my @rules = (
		{ id => 'SCHEMA000', name => 'Missing @type' },
		{ id => 'SCHEMA001', name => 'Missing required property' },
		{ id => 'SCHEMA002', name => 'Unknown type' },
		{ id => 'SCHEMA003', name => 'Invalid property format' },
		{ id => 'SCHEMA004', name => 'Unexpected enumerated value' },
		{ id => 'SCHEMA005', name => 'Invalid performer type' },
		{ id => 'SCHEMA_DYN0', name => 'No dynamic vocabulary definition' },
		{ id => 'SCHEMA_DYN1', name => 'Nested object missing @type' },
		{ id => 'SCHEMA_DYN2', name => 'Unrecognized nested type' },
		{ id => 'SCHEMA_DYNFMT', name => 'Property format mismatch' },
	);

	my $sarif_output = {
		# Single quotes: '$schema' is a literal JSON key, not a Perl variable.
		'$schema' => 'https://json.schemastore.org/sarif-2.1.0.json',
		version => '2.1.0',
		runs => [
			{
				tool => {
					driver => {
						name => 'Schema.org Validator',
						informationUri => 'https://schema.org',
						version => '1.2',
						rules => \@rules,
					},
				},
				results => \@sarif_results,
			},
		],
	};
	open my $sfh, '>', 'schema_validation.sarif' or die "Can't open output SARIF file: $!";
	print $sfh encode_json($sarif_output);
	close $sfh;
	print "SARIF output written to schema_validation.sarif\n";
}

exit $exit_code;

#-----------------------------------------------------------------
# Helper: push_validation
# Records one validation finding and updates the process exit status.
#
# Arguments:
#   $rule     - rule identifier, e.g. 'SCHEMA001'
#   $message  - human-readable description of the problem
#   $location - JSON path of the offending entity or property
#
# In --github mode the finding is appended to @sarif_results, otherwise it is
# printed to STDOUT. Either way $exit_code is overwritten with the code mapped
# to $rule (99 for an unmapped rule), so the most recent finding decides the
# exit status. Returns that exit code.
sub push_validation {
	my ($rule, $message, $location) = @_;
	if ($github_mode) {
		my %sarif_location = (
			physicalLocation => {
				# $file is undefined when the page was read from STDIN; fall
				# back to a placeholder so the uri is a string, not JSON null.
				artifactLocation => { uri => $file // 'stdin' },
			},
		);
		# Keep the JSON path in the report; without it a finding cannot be
		# traced to the entity it belongs to.
		if (defined $location && length $location) {
			$sarif_location{logicalLocations} = [ { fullyQualifiedName => $location } ];
		}
		push @sarif_results, {
			ruleId => $rule,
			level => 'error',
			message => { text => $message },
			locations => [ \%sarif_location ],
		};
	} else {
		print "✗ [$rule] $message at $location\n";
	}
	# Map individual rule names to numeric exit codes.
	# The table lives inside the sub on purpose: the script calls exit before
	# execution reaches this part of the file, so a file-level initialisation
	# placed here would never run.
	my %exit_map = (
		'SCHEMA000' => 2,	# missing/invalid @type
		'SCHEMA001' => 3,	# missing required property
		'SCHEMA002' => 4,	# unknown type
		'SCHEMA003' => 5,	# invalid property format
		'SCHEMA004' => 6,	# unexpected enumerated value
		'SCHEMA005' => 7,	# invalid performer type
		'SCHEMA_DYN0' => 10,	# no dynamic definition
		'SCHEMA_DYN1' => 11,	# nested-object missing @type
		'SCHEMA_DYN2' => 12,	# nested-object unrecognized type
		'SCHEMA_DYNFMT' => 13,	# property format mismatch
		'SCHEMA_CTRY' => 20,	# invalid country code
		# ...add your other rule => code mappings here...
	);
	$exit_code = $exit_map{$rule} // 99;
	return $exit_code;
}

#-----------------------------------------------------------------
# Helper: is_valid_datetime
# Returns 1 when the value is a calendar date, optionally followed by a time:
#   YYYY-MM-DD
#   YYYY-MM-DD(T| )HH:MM[:SS[.fraction]][Z|+HH[:MM]|-HH[:MM]]
# and 0 otherwise (including for undef and references).
#
# The zone designator and fractional seconds are accepted because Schema.org
# DateTime values routinely carry them. Field ranges are checked (month 01-12,
# day 01-31, hour 00-23, minute 00-59, second 00-60), but not whether the day
# exists in the given month.
sub is_valid_datetime {
	my $val = shift;

	return 0 if !defined $val || ref $val;

	# \A ... \z instead of ^ ... $ so a trailing newline is not accepted;
	# /a keeps \d to ASCII digits.
	return $val =~ m{
		\A
		\d{4} - (?: 0[1-9] | 1[0-2] ) - (?: 0[1-9] | [12]\d | 3[01] )
		(?:
			[T ] (?: [01]\d | 2[0-3] ) : [0-5]\d
			(?: : (?: [0-5]\d | 60 ) (?: [.,]\d+ )? )?
			(?: Z | [+-] (?: [01]\d | 2[0-3] ) (?: :? [0-5]\d )? )?
		)?
		\z
	}xa ? 1 : 0;
}

#-----------------------------------------------------------------
# Cross-field validation for MusicEvent performer.
# When the entity has a 'performer' (a single value or an array of them), each
# one must be a JSON object whose @type is PerformingGroup or Person; anything
# else is reported as SCHEMA005 at "<currentPath>->performer".
sub validate_performer {
	my ($entity, $currentPath) = @_;

	return unless exists $entity->{performer};

	my $performer = $entity->{performer};
	my @candidates = (ref($performer) eq 'ARRAY') ? @{$performer} : ($performer);

	for my $candidate (@candidates) {
		# Anything that is not a JSON object cannot carry an @type.
		if (ref $candidate ne 'HASH') {
			push_validation("SCHEMA005", "Performer invalid format: expected JSON object", "$currentPath->performer");
			next;
		}

		my $performer_type = $candidate->{'@type'};
		next if $performer_type && ($performer_type =~ /^(PerformingGroup|Person)$/);

		push_validation("SCHEMA005", "Performer must be of type PerformingGroup or Person", "$currentPath->performer");
	}
}

#-----------------------------------------------------------------
# Recursive function: validate_entity
# Validates one JSON-LD entity against the built-in rule table (%schema) and,
# with --dynamic, against the downloaded Schema.org vocabulary.
#
# Arguments:
#   $entity      - decoded JSON value; must be a hash with an '@type'
#   $depth       - nesting depth, used only to indent the printed report
#   $currentPath - JSON path of the entity ('' for a top-level entity)
#
# Findings are reported through push_validation(); nothing useful is returned.
#
# Notes:
#  - '@type' may be a string or an array of strings. For an array, the entity
#    is checked once per type that is known; only if none is known is a single
#    SCHEMA002 reported.
#  - Property names in the rule table are matched case-insensitively against
#    the entity's keys (exact spelling wins). The required-property check
#    already tolerated lower-case keys; the enum, format and nested checks now
#    behave the same way, so a rule keyed 'startdate' also covers 'startDate'.
#  - The closing "passes basic validation" line is printed only when the
#    required/enum/format checks performed here found nothing.
sub validate_entity {
	my ($entity, $depth, $currentPath) = @_;

	$depth //= 0;
	$currentPath //= '';
	my $indent = ' ' x ($depth * 2);
	my $where = $currentPath || 'root';

	# Every valid block should at least have an '@type'
	unless (ref $entity eq 'HASH' && exists $entity->{'@type'}) {
		push_validation("SCHEMA000", "Missing or invalid \@type", $where);
		return;
	}

	my $type = $entity->{'@type'};

	# JSON-LD allows several types on one node.
	if (ref $type eq 'ARRAY') {
		my @names = grep { defined($_) && !ref($_) && length($_) } @{$type};
		if (!@names) {
			push_validation("SCHEMA000", "Missing or invalid \@type", $where);
			return;
		}
		my @known = grep {
			exists $schema{$_} || ($dynamic_mode && exists $dynamic_schema{$_})
		} @names;
		if (!@known) {
			push_validation('SCHEMA002', "Unknown type '" . join(', ', @names) . "'. Skipping detailed rules check.", $where);
			return;
		}
		for my $name (@known) {
			# Shallow copy with a single type, so the rest of this sub can
			# treat '@type' as a plain string.
			validate_entity({ %{$entity}, '@type' => $name }, $depth, $currentPath);
		}
		return;
	}

	# A null, empty or structured (non-array) '@type' cannot name a type.
	if (!defined $type || ref $type || !length $type) {
		push_validation("SCHEMA000", "Missing or invalid \@type", $where);
		return;
	}

	unless ($github_mode) {
		print "${indent}• Type: $type at ", $where, "\n";
	}

	if (exists $schema{$type}) {
		my $rules = $schema{$type};
		my $failures = 0;	# findings from the required/enum/format checks below

		# Lower-cased name => actual key. Sorted so the choice is stable when
		# two keys differ only in case.
		my %key_for;
		for my $key (sort keys %{$entity}) {
			$key_for{lc $key} //= $key;
		}
		# Returns the entity key that corresponds to a rule's property name,
		# or undef when the entity has no such property.
		my $resolve = sub {
			my $prop = shift;
			return $prop if exists $entity->{$prop};
			return $key_for{lc $prop};
		};

		# Check required properties.
		for my $req (@{ $rules->{required} // [] }) {
			next if defined $resolve->($req);
			push_validation('SCHEMA001', "Missing required property '$req' for type '$type'", $where);
			$failures++;
		}

		# Check enumerated values.
		for my $prop (sort keys %{ $rules->{enum} // {} }) {
			my $key = $resolve->($prop);
			next unless defined $key;
			my $value = $entity->{$key} // '';	# JSON null compares as ''
			next if grep { $_ eq $value } @{ $rules->{enum}{$prop} };
			push_validation("SCHEMA004", "Unexpected value '$value' for property '$key'", $where);
			$failures++;
		}

		# Custom property validations.
		for my $prop (sort keys %{ $rules->{property_validations} // {} }) {
			my $key = $resolve->($prop);
			next unless defined $key;
			my $validator = $rules->{property_validations}{$prop};
			next if $validator->($entity->{$key});
			push_validation("SCHEMA003", "Invalid format for property '$key'", $where);
			$failures++;
		}

		# Validate nested objects. Each nested entity prints its own verdict;
		# scalar values under a nested property are left alone.
		for my $prop (sort keys %{ $rules->{nested} // {} }) {
			my $key = $resolve->($prop);
			next unless defined $key;
			my $child = $entity->{$key};
			my $childPath = $currentPath ? $currentPath . '->' . $key : $key;
			if (ref $child eq 'ARRAY') {
				for my $i (0 .. $#{$child}) {
					validate_entity($child->[$i], $depth + 1, $childPath . "[$i]");
				}
			} elsif (ref $child eq 'HASH') {
				validate_entity($child, $depth + 1, $childPath);
			}
		}

		# Cross-field check: validate MusicEvent performer.
		if ($type eq 'MusicEvent') {
			validate_performer($entity, $where);
		}

		# ---- Dynamic Validation: Additional checks from Schema.org vocabulary ----
		if ($dynamic_mode) {
			if (exists $dynamic_schema{$type}) {
				dynamic_validate_entity($entity, $type, $where, $indent);
			} else {
				push_validation("SCHEMA002", "No dynamic vocabulary definition for type '$type'", $where);
			}
		}

		# Do not announce success right after reporting a failure.
		if (!$github_mode && !$failures) {
			print "${indent}✓ $type passes basic validation at ", $where, "\n\n";
		}
	} elsif ($dynamic_mode && exists($dynamic_schema{$type})) {
		dynamic_validate_entity($entity, $type, $where, $indent);
	} else {
		push_validation('SCHEMA002', "Unknown type '$type'. Skipping detailed rules check.", $where);
	}
	return;
}

#-----------------------------------------------------------------

# Helper: validate_property_format
# Checks one scalar property value against the types that the dynamic
# vocabulary lists in the property's rangeIncludes.
#
# Arguments:
#   $prop_name - property name, used in messages
#   $value     - the scalar value found in the JSON-LD (may be undef or a JSON boolean)
#   $prop_def  - the property's vocabulary definition (hash ref)
#   $location  - JSON path used when reporting
#
# Returns nothing. If the definition has no rangeIncludes the value is not
# checked. Otherwise the value must satisfy at least one listed type, or a
# SCHEMA_DYNFMT finding is pushed. A listed type without a check here produces
# a warning and cannot make the value valid.
sub validate_property_format
{
	my ($prop_name, $value, $prop_def, $location) = @_;

	# Attempt to extract the expected type(s) from rangeIncludes.
	my ($key) = grep { /(?:schema:)?rangeIncludes$/ } keys %{$prop_def};
	my $range = $key ? $prop_def->{$key} : undef;

	return unless defined $range;

	# Gather expected types as short names ("URL", "Text", "DateTime").
	# Both spellings are reduced - a full URI ("https://schema.org/URL") and
	# the compact form ("schema:URL") - so each branch below needs one name.
	my @expected;
	for my $entry (ref $range eq 'ARRAY' ? @{$range} : ($range)) {
		my $name = ref($entry) eq 'HASH' ? $entry->{'@id'} : $entry;
		next if !defined $name || ref $name;
		$name =~ s{^.*/}{};
		$name =~ s{^schema:}{};
		push @expected, $name if length $name;
	}

	# JSON true/false arrive as boolean objects, not as the words "true"/"false".
	my $is_json_bool = JSON::MaybeXS::is_bool($value);
	# String form used by the pattern checks; JSON null becomes ''.
	my $text = defined $value ? "$value" : '';

	# Optional zone designator shared by the DateTime and Time patterns.
	my $zone = qr/(?:Z|[+-]\d{2}(?::?\d{2})?)/;

	my $valid = 0;
	foreach my $type (@expected) {
		if($type eq 'Text') {
			# anything goes
			$valid = 1;
		} elsif($type eq 'URL') {
			my $uri = URI->new($text);
			# Schemes such as mailto: or tel: yield objects without a host()
			# method; calling it unguarded would abort the whole run.
			$valid = 1 if $uri->scheme && $uri->can('host') && $uri->host;
		} elsif($type eq 'Email') {
			# simple regex; swap in Email::Valid->address($value) if you like
			$valid = 1 if $text =~ /^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$/;
		} elsif($type eq 'Date') {
			$valid = 1 if length($text) && is_valid_iso8601($text);
		} elsif($type eq 'DateTime') {
			$valid = 1 if $text =~ /\A\d{4}-\d{2}-\d{2}[T ]\d{2}:\d{2}(?::\d{2}(?:\.\d+)?)?$zone?\z/;
		} elsif($type eq 'Time') {
			$valid = 1 if $text =~ /\A\d{2}:\d{2}(?::\d{2}(?:\.\d+)?)?$zone?\z/;
		} elsif($type eq 'Number' || $type eq 'Float') {
			# Signed decimal with optional exponent (negative values are legitimate).
			$valid = 1 if $text =~ /\A[-+]?(?:\d+(?:\.\d+)?|\.\d+)(?:[eE][-+]?\d+)?\z/;
		} elsif($type eq 'Integer') {
			$valid = 1 if $text =~ /\A[-+]?\d+\z/;
		} elsif($type eq 'Boolean') {
			$valid = 1 if $is_json_bool || $text =~ /^(?:true|false)$/i;
		} elsif($type eq 'Country') {
			# Require a code that Locale::Country recognises (2-letter ISO by default).
			$valid = 1 if length($text) && Locale::Country::code2country(uc($text));
		} elsif($type eq 'Gender' || $type eq 'GenderType') {
			$valid = 1 if ($text eq 'Male') || ($text eq 'Female');
		} else {
			# Add more specialized types here as needed...
			# TODO:
			#	Adding checks for GeoCoordinates (latitude/longitude ranges).
			#	Validating PostalCode with country-specific regexes.
			#	Checking Language codes (ISO 639-1).
			#	Incorporating ContactPoint types (telephone patterns).
			warn "Add validation for '$type', needed for '$prop_name'";
		}
		last if $valid;
	}

	return if $valid;

	# Test the length, not the truth: the value 0 is not "empty".
	if(length $text) {
		push_validation('SCHEMA_DYNFMT',
			"Value '$text' for property '$prop_name' does not conform to any expected format.",
			$location
		);
	} else {
		push_validation('SCHEMA_DYNFMT',
			"Unexpected empty value for property '$prop_name'.",
			$location
		);
	}
	return;
}

# Dynamic validation of one entity against the loaded Schema.org vocabulary.
#
# Arguments:
#   $entity      - hash ref of the decoded JSON-LD entity
#   $type        - the entity's type name, looked up in %dynamic_schema
#   $currentPath - JSON path of the entity, used when reporting
#   $indent      - prefix for the informational lines printed outside --github mode
#
# A type missing from the vocabulary is reported as SCHEMA_DYN0. Otherwise each
# property except @type, @context and @id is examined:
#  - a nested object (directly, or as an array element) must carry an @type
#    (SCHEMA_DYN1) that the vocabulary knows (SCHEMA_DYN2);
#  - any other value (directly, or as an array element) is passed to
#    validate_property_format() when the vocabulary defines the property.
sub dynamic_validate_entity
{
	my ($entity, $type, $currentPath, $indent) = @_;

	if (!exists $dynamic_schema{$type}) {
		push_validation('SCHEMA_DYN0',
			"No dynamic vocabulary definition for type '$type'",
			($currentPath || 'root')
		);
		return;
	}

	my $definition = $dynamic_schema{$type};
	my $comment = $definition->{'rdfs:comment'} // $definition->{'http://www.w3.org/2000/01/rdf-schema#comment'};
	if ($comment) {
		my $first_comment = ref($comment) eq 'ARRAY' ? $comment->[0] : $comment;
		print "${indent}Dynamic info for $type: $first_comment\n" unless $github_mode;
	}
	print "${indent}Dynamic validations enabled for $type. Checking property formats...\n" unless $github_mode;

	# Shared check for a nested object. The two suffixes carry the wording
	# that differs between a direct value and an array element.
	my $check_nested = sub {
		my ($object, $prop, $path, $missing_suffix, $unknown_suffix) = @_;

		if (!exists $object->{'@type'}) {
			push_validation("SCHEMA_DYN1",
				"Missing nested \@type" . $missing_suffix . " for property '$prop'",
				$path
			);
			return;
		}

		my $nested_type = $object->{'@type'};
		return if exists $dynamic_schema{$nested_type};

		push_validation("SCHEMA_DYN2",
			"Unrecognized nested type '$nested_type'" . $unknown_suffix . " for property '$prop'",
			$path
		);
	};

	# Shared check for a non-object value: only properties present in the
	# vocabulary are checked.
	my $check_value = sub {
		my ($prop, $value, $path) = @_;

		return unless exists $dynamic_properties{$prop};
		validate_property_format($prop, $value, $dynamic_properties{$prop}, $path);
	};

	# Examine every property in the entity (except for metadata keys).
	for my $prop (keys %{$entity}) {
		next if $prop eq '@type';
		next if $prop =~ /^@(context|id)$/;

		my $value = $entity->{$prop};
		my $childPath = $currentPath ? "$currentPath->$prop" : $prop;
		my $kind = ref($value);

		if ($kind eq 'HASH') {
			$check_nested->($value, $prop, $childPath, '', '');
		} elsif ($kind eq 'ARRAY') {
			my $index = 0;
			for my $item (@{$value}) {
				my $itemPath = $childPath . "[$index]";
				$index++;
				if (ref($item) eq 'HASH') {
					$check_nested->($item, $prop, $itemPath, ' in array element', ' in array');
				} else {
					$check_value->($prop, $item, $itemPath);
				}
			}
		} else {
			$check_value->($prop, $value, $childPath);
		}
	}
}

# Loads the dynamic Schema.org vocabulary.
#
# The vocabulary is taken from <cache_dir>/schema_validator/ when the cached
# copy is less than a day old, and downloaded from schema.org otherwise (the
# download is then cached on a best-effort basis). Classes and properties from
# the document's '@graph' are indexed both by their rdfs:label and by the
# short name at the end of their '@id'; compact and expanded label keys are
# both understood.
#
# Side effects: fills the file-level %dynamic_schema and %dynamic_properties
# and emits a one-line summary through warn.
# Returns: the class table as a flat key/value list, or an empty list when the
# vocabulary could not be fetched or parsed.
sub load_dynamic_vocabulary
{
	my $cache_duration = 86400;	# Cache expires in 1 day (86400 seconds)

	my $vocab_dir = File::Spec->catdir($cache_dir, 'schema_validator');
	if(!-d $vocab_dir) {
		# Parenthesised call plus 'or': "mkdir $dir, 02755 || die" binds the
		# || to the mode, so the check never fired. The error is in $!, not $@.
		mkdir($vocab_dir, 02755) or die "$vocab_dir: $!";
	}
	my $cache_file = File::Spec->catfile($vocab_dir, 'schemaorg_dynamic_vocabulary.jsonld');

	my $content;

	if(-e $cache_file && (time - (stat($cache_file))[9]) < $cache_duration) {
		# An unreadable cache is treated as a miss: warn and fall through to
		# the download. The eval is needed because autodie makes a failed open
		# throw rather than return false.
		$content = eval {
			open(my $cfh, '<:raw', $cache_file) or die "$!\n";
			local $/;	# Slurp mode
			my $cached = <$cfh>;
			close($cfh);
			$cached;
		};
		warn "Could not open cache file $cache_file: $@" unless defined $content;
	}

	if(!defined $content) {
		# Download the vocabulary from Schema.org
		if(eval { require HTTP::Cache::Transparent; }) {
			HTTP::Cache::Transparent->import();
			my $dir = File::Spec->catfile($cache_dir, 'http-cache-transparent');
			HTTP::Cache::Transparent::init({
				BasePath => $dir,
				# Verbose => $opts{'v'} ? 1 : 0,
				Verbose => 0,
				NoUpdate => 60 * 60 * 24,
				MaxAge => 30 * 24
			}) || die "$0: $cache_dir: $!";
		} else {
			print "Consider installing HTTP::Cache::Transparent to reduce downloads\n";
		}

		my $url = 'https://schema.org/version/latest/schemaorg-current-https.jsonld';
		my $ua = LWP::UserAgent->new(timeout => 30);
		my $res = $ua->get($url);

		unless($res->is_success()) {
			warn "Failed to fetch dynamic vocabulary from $url: ", $res->status_line();
			return ();
		}

		# charset => 'none' keeps the body as bytes, which is what both
		# decode_json() and the byte-oriented cache file expect.
		$content = $res->decoded_content(charset => 'none');
		unless(defined $content) {
			warn "Failed to decode dynamic vocabulary fetched from $url";
			return ();
		}

		# Caching is best effort: a failure to write must not stop validation.
		eval {
			open(my $cfh, '>:raw', $cache_file) or die "$!\n";
			print {$cfh} $content or die "$!\n";
			close($cfh) or die "$!\n";
			1;
		} or warn "Could not write to cache file $cache_file: $@";
	}

	my $data = eval { decode_json($content) };
	if ($@) {
		warn "Failed to parse dynamic vocabulary JSON: $@";
		return ();
	}

	my %class_vocab;
	my %prop_vocab;

	# True when $types (a string or an array of strings) names $wanted.
	my $has_type = sub {
		my ($types, $wanted) = @_;
		my @names = ref $types eq 'ARRAY' ? @{$types} : ($types);
		return scalar grep { defined($_) && !ref($_) && $_ eq $wanted } @names;
	};

	# Files $item in %$vocab under its label and under the short name taken
	# from its '@id'. Items without a label are skipped entirely.
	my $register = sub {
		my ($vocab, $item) = @_;

		# 1) Human-readable label (compact or expanded key)
		my $label = $item->{'rdfs:label'}
				 // $item->{'http://www.w3.org/2000/01/rdf-schema#label'};
		$label = $label->[0] if ref($label) eq 'ARRAY';
		# Language-tagged labels are objects carrying the text in '@value'.
		$label = $label->{'@value'} if ref($label) eq 'HASH';
		return unless $label;
		$vocab->{$label} = $item;

		# 2) Short name from @id (e.g. "MusicGroup", "startDate")
		my $full_id = $item->{'@id'};
		if ($full_id && $full_id =~ m{([^/:]+)$}) {
			$vocab->{$1} = $item;
		}
		return;
	};

	# The root must be an object before '@graph' can be looked up in it.
	if (ref($data) eq 'HASH' && ref($data->{'@graph'}) eq 'ARRAY') {
		for my $item (@{ $data->{'@graph'} }) {
			next unless ref($item) eq 'HASH' && exists $item->{'@type'};
			my $item_type = $item->{'@type'};

			$register->(\%class_vocab, $item) if $has_type->($item_type, 'rdfs:Class');
			$register->(\%prop_vocab, $item) if $has_type->($item_type, 'rdf:Property');
		}
	} else {
		warn "No '\@graph' key found in the vocabulary JSON.";
	}

	# Assign the populated hashes to the global variables.
	%dynamic_schema = %class_vocab;
	%dynamic_properties = %prop_vocab;

	warn 'Dynamic vocabulary loaded: ', scalar(keys %dynamic_schema),
		' classes and ', scalar(keys %dynamic_properties), ' properties found.';

	return %dynamic_schema;
}

# Returns 1 when DateTime::Format::ISO8601 can parse the string, 0 when the
# parse attempt throws.
sub is_valid_iso8601
{
	my $candidate = shift;

	# The trailing 1 makes the eval block true only when the parse call
	# returned normally; an exception leaves it false.
	my $parsed_ok = eval {
		DateTime::Format::ISO8601->parse_datetime($candidate);
		1;
	};

	return $parsed_ok ? 1 : 0;
}
